{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset: Social media data\n",
    "\n",
    "https://github.com/abulbasar/data/blob/master/snsdata.csv?raw=true\n",
    "\n",
    "- Use the 36 word-count features - \"basketball\", \"football\" … \"drunk\", \"drugs\" (each column records how many times a user has used that word in their profile) and apply K-Means clustering to group the profiles into 5 clusters.\n",
    "- Find the number of users in each cluster and the mean distance within each cluster.\n",
    "- Which cluster is the most dense in terms of average distance?\n",
    "- How many anomalies are there?\n",
    "- For each cluster, find the top 3 dominant features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import *\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gradyear</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>friends</th>\n",
       "      <th>basketball</th>\n",
       "      <th>football</th>\n",
       "      <th>soccer</th>\n",
       "      <th>softball</th>\n",
       "      <th>volleyball</th>\n",
       "      <th>swimming</th>\n",
       "      <th>...</th>\n",
       "      <th>blonde</th>\n",
       "      <th>mall</th>\n",
       "      <th>shopping</th>\n",
       "      <th>clothes</th>\n",
       "      <th>hollister</th>\n",
       "      <th>abercrombie</th>\n",
       "      <th>die</th>\n",
       "      <th>death</th>\n",
       "      <th>drunk</th>\n",
       "      <th>drugs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2006</td>\n",
       "      <td>M</td>\n",
       "      <td>18.982</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2006</td>\n",
       "      <td>F</td>\n",
       "      <td>18.801</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2006</td>\n",
       "      <td>M</td>\n",
       "      <td>18.335</td>\n",
       "      <td>69</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2006</td>\n",
       "      <td>F</td>\n",
       "      <td>18.875</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2006</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18.995</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 40 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   gradyear gender     age  friends  basketball  football  soccer  softball  \\\n",
       "0      2006      M  18.982        7           0         0       0         0   \n",
       "1      2006      F  18.801        0           0         1       0         0   \n",
       "2      2006      M  18.335       69           0         1       0         0   \n",
       "3      2006      F  18.875        0           0         0       0         0   \n",
       "4      2006    NaN  18.995       10           0         0       0         0   \n",
       "\n",
       "   volleyball  swimming  ...    blonde  mall  shopping  clothes  hollister  \\\n",
       "0           0         0  ...         0     0         0        0          0   \n",
       "1           0         0  ...         0     1         0        0          0   \n",
       "2           0         0  ...         0     0         0        0          0   \n",
       "3           0         0  ...         0     0         0        0          0   \n",
       "4           0         0  ...         0     0         2        0          0   \n",
       "\n",
       "   abercrombie  die  death  drunk  drugs  \n",
       "0            0    0      0      0      0  \n",
       "1            0    0      0      0      0  \n",
       "2            0    0      1      0      0  \n",
       "3            0    0      0      0      0  \n",
       "4            0    0      0      1      1  \n",
       "\n",
       "[5 rows x 40 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"https://github.com/abulbasar/data/blob/master/snsdata.csv?raw=true\")\n",
    "df.head()"
   ]
  },
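  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Proportion of male/female profiles in the dataset. The counts are divided by the total number of rows, so the two proportions do not sum to 1; the remainder are profiles with a missing gender value."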
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "F    0.735133\n",
       "M    0.174067\n",
       "Name: gender, dtype: float64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.gender.value_counts()/len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = df.columns[4:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjgAAAGoCAYAAABL+58oAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3X+0XWV95/H3JwmJCgpC4i+SkGhShsRVUTJIR0epOhLUTGyXtWS0/lhUKiPO2NppwdapruoMrik6tlIRxwwgEsygrVDTQQt1GBSV4OBIpIwRwURSgkJQ0BKD3/lj7xtOLvfmnuT+OJd936+17jrn7P2cvb/nYefeD8/znHNSVUiSJHXJrEEXIEmSNNEMOJIkqXMMOJIkqXMMOJIkqXMMOJIkqXMMOJIkqXMMONI0k2RLkpMHXccgJfm1JNuSPJDkuSPsf0GS77T7Xz2IGidakguSvHvQdUhdET8HR5o6Se4Afruq/q5n25vabS88gOMsAb4HHFJVeya2ysFL8l3g96rqc6Psvwa4sqo+PLWVSXqscARH0qMkmTPgEo4BthzM/jT83SbNcP4SkKaZJHckeVl7/8Qkm5P8OMndST7YNruuvd3VTtP8SpJZSf44yZ1Jdia5JMnhPcd9Q7vvR0nePew870lyRZJLk/wYeFN77huS7EqyI8lHksztOV4l+bftVNFPkvxpkme1z/lxko297Ye9xhFrTTIvyQPAbOCb7UjO8Od+F3gmcFX72ucl+VKS9yf5MvBT4Jnt8T7R1v6DJO9LMrs9xuwkf5bkh0luT/K29vXMGf7foKd/Lu15fFKSr7R9883eKcW2lj9N8uW2X76QZH7P/hf2PHdbO4JHkouSvK+n3auS3Ny2+0qSX+7Z94fta/pJktuSvHTEi0mawQw40vT2YeDDVfUk4FnAxnb7i9rbI6rqsKq6AXhT+/OrNAHgMOAjAElWAH8JvA54OnA4cPSwc60FrgCOAD4FPAz8LjAf+BXgpcC/Hfac1cAJwEnAHwAXtudYBDwbWDfK6xqx1qp6qKoOa9s8p6qeNfyJ7bbvA2va1/5Qu+u3gDOAJwJ3AhcDe4BlwHOBlwO/3bZ9C/Cqdvsq4DWj1PkoSY4GPg+8DzgS+H3gM0kW9DT7N8CbgacAc9s2JFkM/C3wF8AC4Hjg5hHO8TxgPfA7wFHAx4Ar2zB3LHAW8M+r6onAKcAd/dYvzRQGHGnq/XX7f+W7kuyiCR6j+TmwLMn8qnqgqr66n7avAz5YVbdX1QPAOcBp7ajEa4Crqur6qtoN/Edg+AK8G6rqr6vqF1X1s6q6qaq+WlV7quoOmj+yLx72nA9U1Y+ragtwC/CF9vz30/whf9QC4T5qPVgXVdWWdk3SkcCpwDuq6sGq2gl8CDitbfta4L9W1baquhf4zwdwntcDm6pqU9tXXwQ2A6/oafPfq+r/VdXPaELp8e321wF/V1UbqurnVfWjqnpUwKEJYB+rqq9V1cNVdTHwEE2QfBiYB6xIckhV3VFVjxrpkmY6A4409V5dVUcM/fDoUZFepwO/BPxDkhuTvGo/bZ9BM3Ix5E5gDvDUdt+2oR1V9VPgR8Oev633QZJfSvI3Sf6xnbb6TzSjOb3u7rn/sxEeH8bI9lfrweqt/xjgEGBHT5D8GM2IytD5e9v31jKWY4DfGBZSX0gzMjbkH3vu/5RH+mER0E8YOQZ457BzLAKeUVVbgXcA7wF2Jrk8yTMOoH5pRjDgSNNYVX2nqtbR/GH+AHBFkkN59OgLwF00fxiHLKaZorkb2AEsHNqR5PE0Ux/7nG7Y448C/wAsb6fI3gXk4F9N37UerN76t9GMeMzvCZNPqqqV7f4dNIGh9/y9HgSe0PP4acOO/cnekFpVh1bVuX3UuI1mqrGfdu8fdo4nVNUGgKq6rH3X3TE0r/sDfRxTmlEMONI0luT1SRZU1S+AXe3mh4F7gF/Q
rF8ZsgH43SRLkxxGM+Ly6XbK5gpgTZJ/0S78fS9jh5UnAj8GHkjyz4AzJ+yF7b/WcauqHcAXgPOSPKld1PysJENTbBuBf5dkYZInA2cPO8TNNFNmhyQZvkbnUpq+PKVdrPy4JCcnWcjYPgW8LMlrk8xJclSS40do93HgrUmen8ahSV6Z5IlJjk3ykiTzgH+iGSl7uO/OkWYIA440va0GtrTvLPowcFpV/VM7xfR+4MvtFMZJNItSP0nzDqvv0fzxeztAu0bm7cDlNKMXPwF20oxyjOb3aRbL/oTmD+6nJ/B1jVrrBHoDzQLfbwP30YS8oWmkjwNXA98EvgF8dthz300z0nIfTRi8bGhHVW2jWZD9LpqguQ34D/Tx+7Sqvk+zVuedwL00Qeo5I7TbTLMO5yNtDVtpFmVDs/7mXOCHNFNhT2lrkdTDD/qTZqB21GQXzfTT9wZdz6Cl4x+cKM1EjuBIM0SSNUme0K7h+TPgW/j2YkkdZcCRZo61NIt77wKW00x3OYQrqZOcopIkSZ3jCI4kSeqcQX+hHgDz58+vJUuWDLoMSZI0zd10000/rKoFY7UbaMBJsgZYs2zZMjZv3jzIUiRJ0mNAkr4+eXygU1RVdVVVnXH44YeP3ViSJKlPrsGRJEmdY8CRJEmdY8CRJEmdM+GLjJPMAv4UeBKwuaounuhzSJIk7U9fIzhJ1ifZmeSWYdtXJ7ktydYkQ9/GuxY4Gvg5sH1iy5UkSRpbvyM4F9F8q+0lQxuSzAbOB/4VTZC5McmVwLHADVX1sSRXANdMaMUHYcnZnx+zzR3nvnIKKpEkSVOhrxGcqroOuHfY5hOBrVV1e1XtBi6nGb3ZDtzXtnl4tGMmOSPJ5iSb77nnngOvXJIkaRTjWWR8NLCt5/H2dttngVOS/AVw3WhPrqoLgfcC35g7d+44ypAkSdrXeBYZZ4RtVVU/BU4fx3ElSZLGZTwjONuBRT2PFwJ3HcgB/CRjSZI0GcYTcG4ElidZmmQucBpw5YEcIMmaJBfef//94yhDkiRpX/2+TXwDcANwbJLtSU6vqj3AWcDVwK3AxqraMnmlSpIk9aevNThVtW6U7ZuATQd78qq6Crhq1apVbznYY0iSJA030K9qcIpKkiRNhoEGHBcZS5KkyeCXbUqSpM5xikqSJHWOU1SSJKlznKKSJEmd4xSVJEnqHKeoJElS5zhFJUmSOseAI0mSOsc1OJIkqXNcgyNJkjrHKSpJktQ5BhxJktQ5BhxJktQ5Ex5wkpyc5H8nuSDJyRN9fEmSpLH0FXCSrE+yM8ktw7avTnJbkq1Jzm43F/AA8Dhg+8SWK0mSNLZ+R3AuAlb3bkgyGzgfOBVYAaxLsgL431V1KvCHwHsnrlRJkqT+9BVwquo64N5hm08EtlbV7VW1G7gcWFtVv2j33wfMG+2YSc5IsjnJ5nvuuecgSpckSRrZnHE892hgW8/j7cDzk/w6cApwBPCR0Z5cVRcm2QGsmTt37gnjqEOSJGkf4wk4GWFbVdVngc+O47iSJEnjMp53UW0HFvU8XgjcdSAH8JOMJUnSZBhPwLkRWJ5kaZK5wGnAlQdyAL+LSpIkTYZ+3ya+AbgBODbJ9iSnV9Ue4CzgauBWYGNVbTmQkzuCI0mSJkNfa3Cqat0o2zcBmw725EnWAGuWLVt2sIeQJEl6FL9NXJIkdc5AA45rcCRJ0mRwBEeSJHWOIziSJKlzHMGRJEmdM9CAI0mSNBmcopIkSZ3jFJUkSeocp6gkSVLnGHAkSVLnGHAkSVLnuMhYkiR1jouMJUlS5zhFJUmSOseAI0mSOmdSAk6SQ5PclORVk3F8SZKk/ekr4CRZn2RnkluGbV+d5LYkW5Oc3bPrD4GNE1moJElSv/odwbkIWN27Icls4HzgVGAFsC7JiiQvA74N3D2BdUqSJPVtTj+Nquq6JEuGbT4R2FpVtwMkuRxYCxwGHEoTen6WZFNV/WL4MZOcAZwBsHjx4oOtX5Ik6VH6
CjijOBrY1vN4O/D8qjoLIMmbgB+OFG4AqurCJDuANXPnzj1hHHVIkiTtYzyLjDPCttp7p+qiqvqb/R3Az8GRJEmTYTwBZzuwqOfxQuCuAzmAn2QsSZImw3gCzo3A8iRLk8wFTgOunJiyJEmSDl6/bxPfANwAHJtke5LTq2oPcBZwNXArsLGqthzIyZ2ikiRJk6Hfd1GtG2X7JmDTwZ48yRpgzbJlyw72EJIkSY/il21KkqTOGWjAcZGxJEmaDI7gSJKkznEER5IkdY4jOJIkqXMGGnAkSZImg1NUkiSpc5yikiRJneMUlSRJ6hwDjiRJ6hwDjiRJ6hwXGUuSpM5xkbEkSeocp6gkSVLnzBl0AdPFkrM/31e7O8595SRXIkmSxmvCR3CSHJfkgiRXJDlzoo8vSZI0lr4CTpL1SXYmuWXY9tVJbkuyNcnZAFV1a1W9FXgtsGriS5YkSdq/fkdwLgJW925IMhs4HzgVWAGsS7Ki3fevgeuBayasUkmSpD71FXCq6jrg3mGbTwS2VtXtVbUbuBxY27a/sqr+BfC60Y6Z5Iwkm5Nsvueeew6uekmSpBGMZ5Hx0cC2nsfbgecnORn4dWAesGm0J1fVhUl2AGvmzp17wjjqkCRJ2sd4Ak5G2FZV9SXgS/0coKquAq5atWrVW8ZRhyRJ0j7G8y6q7cCinscLgbsO5AB+krEkSZoM4wk4NwLLkyxNMhc4DbhyYsqSJEk6eP2+TXwDcANwbJLtSU6vqj3AWcDVwK3AxqraciAn96saJEnSZOhrDU5VrRtl+yb2s5B4LEnWAGuWLVt2sIeQJEl6FL9sU5Ikdc5AA46LjCVJ0mRwBEeSJHWOIziSJKlzHMGRJEmdM9CAI0mSNBmcopIkSZ3jFJUkSeocp6gkSVLnGHAkSVLnGHAkSVLnuMhYkiR1jouMJUlS5zhFJUmSOseAI0mSOmdSAk6SVyf5eJLPJXn5ZJxDkiRpNH0HnCTrk+xMcsuw7auT3JZka5KzAarqr6vqLcCbgN+c0IolSZLGcCAjOBcBq3s3JJkNnA+cCqwA1iVZ0dPkj9v9kiRJU2ZOvw2r6rokS4ZtPhHYWlW3AyS5HFib5FbgXOBvq+obIx0vyRnAGQCLFy8+8MoHZMnZn++r3R3nvnKSK5EkSaPpO+CM4mhgW8/j7cDzgbcDLwMOT7Ksqi4Y/sSqujDJDmDN3LlzTxhnHZIkSXuNd5FxRthWVfXnVXVCVb11pHDT09DPwZEkSRNuvAFnO7Co5/FC4K5+n+wnGUuSpMkw3oBzI7A8ydIkc4HTgCvHX5YkSdLBO5C3iW8AbgCOTbI9yelVtQc4C7gauBXYWFVb+j2mU1SSJGkyHMi7qNaNsn0TsOlgTp5kDbBm2bJlB/N0SZKkEfllm5IkqXMGGnBcZCxJkiaDIziSJKlzHMGRJEmd4wiOJEnqnPF+VYNG0c93Vvl9VZIkTQ6nqCRJUuc4RSVJkjpnoAFHkiRpMhhwJElS5xhwJElS57jIWJIkdY6LjCVJUuc4RSVJkjrHgCNJkjpnwgNOkmcm+USSKyb62JIkSf3o66sakqwHXgXsrKpn92xfDXwYmA38t6o6t6puB0434Iytn69zAL/SQZKkA9XvCM5FwOreDUlmA+cDpwIrgHVJVkxodZIkSQehr4BTVdcB9w7bfCKwtapur6rdwOXA2n5PnOSMJJuTbL7nnnv6LliSJGks41mDczSwrefxduDoJEcluQB4bpJzRntyVV1YVauqatWCBQvGUYYkSdK++lqDM4qMsK2q6kfAW/s6QLIGWLNs2bJxlCFJkrSv8YzgbAcW9TxeCNw1vnIkSZLGbzwB50ZgeZKlSeYCpwFXTkxZkiRJB6+vgJNkA3ADcGyS7UlOr6o9wFnA1cCtwMaq2nIgJ/erGiRJ0mToaw1OVa0bZfsmYNPBntw1OJIkaTL4ZZuSJKlzxvMuqnFzBKc//XzisZ92LEnSIxzBkSRJnTPQgJNkTZIL
77///kGWIUmSOsYRHEmS1DkDDTiSJEmTwYAjSZI6xzU4kiSpc1yDI0mSOscpKkmS1DkGHEmS1Dl+knFH9PNpx+AnHkuSZgbX4EiSpM5xikqSJHWOAUeSJHXOhK/BSXIo8JfAbuBLVfWpiT6HJEnS/vQ1gpNkfZKdSW4Ztn11ktuSbE1ydrv514ErquotwL+e4HolSZLG1O8U1UXA6t4NSWYD5wOnAiuAdUlWAAuBbW2zhyemTEmSpP71FXCq6jrg3mGbTwS2VtXtVbUbuBxYC2ynCTl9H1+SJGkijWcNztE8MlIDTbB5PvDnwEeSvBK4arQnJzkDOANg8eLF4yhDB6Kfz8vp97Ny+v3snX70c87H+mf9PNbrl6SJ/Bsy2cYTcDLCtqqqB4E3j/XkqrowyQ5gzdy5c08YRx2SJEn7GM8U0nZgUc/jhcBd4ytHkiRp/MYTcG4ElidZmmQucBpw5YEcwE8yliRJk6Hft4lvAG4Ajk2yPcnpVbUHOAu4GrgV2FhVWw7k5EnWJLnw/vvvP9C6JUmSRtXXGpyqWjfK9k3ApgmtSJIkaZz8sk1JktQ5qarBnTxZA6wBfhP4ziSeaj7ww0k8fhfYR2Ozj8ZmH43NPhqbfTS2mdxHx1TVgrEaDTTgTJUkm6tq1aDrmM7so7HZR2Ozj8ZmH43NPhqbfTQ2P2lYkiR1jgFHkiR1zkwJOBcOuoDHAPtobPbR2OyjsdlHY7OPxmYfjWFGrMGRJEkzy0wZwZEkSTOIAUeSJHVOpwNOktVJbkuyNcnZg65nukhyR5JvJbk5yeZ225FJvpjkO+3tkwdd51RLsj7JziS39GwbsV/S+PP22vq/SZ43uMqnxij9854kP2ivpZuTvKJn3zlt/9yW5JTBVD21kixK8vdJbk2yJcm/b7d7HbX200deS60kj0vy9STfbPvove32pUm+1l5Hn26/B5Ik89rHW9v9SwZZ/3TR2YCTZDZwPnAqsAJYl2TFYKuaVn61qo7v+RyFs4Frqmo5cE37eKa5CFg9bNto/XIqsLz9OQP46BTVOEgX8ej+AfhQey0d3359C+2/tdOAle1z/rL9N9l1e4B3VtVxwEnA29q+8Dp6xGh9BF5LQx4CXlJVzwGOB1YnOQn4AE0fLQfuA05v258O3FdVy4APte1mvM4GHOBEYGtV3V5Vu4HLgbUDrmk6Wwtc3N6/GHj1AGsZiKq6Drh32ObR+mUtcEk1vgockeTpU1PpYIzSP6NZC1xeVQ9V1feArTT/JjutqnZU1Tfa+z+h+SLio/E62ms/fTSaGXcttdfDA+3DQ9qfAl4CXNFuH34dDV1fVwAvTZIpKnfa6nLAORrY1vN4O/v/RzSTFPCFJDclOaPd9tSq2gHNLyDgKQOrbnoZrV+8vh5xVju9sr5nanPG9087TfBc4Gt4HY1oWB+B19JeSWYnuRnYCXwR+C6wq6r2tE16+2FvH7X77weOmtqKp58uB5yR0qvviW+8oKqeRzM8/rYkLxp0QY9BXl+NjwLPohlG3wGc126f0f2T5DDgM8A7qurH+2s6wrYZ0U8j9JHXUo+qeriqjgcW0oxYHTdSs/Z2RvbRWLoccLYDi3oeLwTuGlAt00pV3dXe7gT+iuYfz91DQ+Pt7c7BVTitjNYvXl9AVd3d/iL+BfBxHpk6mLH9k+QQmj/cn6qqz7abvY56jNRHXksjq6pdwJdo1isdkWROu6u3H/b2Ubv/cPqfTu6sLgecG4Hl7arzuTSL1K4ccE0Dl+TQJE8cug+8HLiFpm/e2DZ7I/C5wVQ47YzWL1cCb2jfBXMScP/QFMRMMmy9yK/RXEvQ9M9p7bs7ltIsov36VNc31dp1D58Abq2qD/bs8jpqjdZHXkuPSLIgyRHt/ccDL6NZq/T3wGvaZsOvo6Hr6zXAteWn+DJn7CaPTVW1J8lZwNXAbGB9VW0ZcFnTwVOBv2rXn80BLquq/5nkRmBjktOB7wO/McAaByLJBuBkYH6S7cCf
AOcycr9sAl5Bs+Dxp8Cbp7zgKTZK/5yc5Hia4fA7gN8BqKotSTYC36Z518zbqurhQdQ9xV4A/BbwrXb9BMC78DrqNVofrfNa2uvpwMXtu8VmARur6m+SfBu4PMn7gP9DExRpbz+ZZCvNyM1pgyh6uvGrGiRJUud0eYpKkiTNUAYcSZLUOQYcSZLUOQYcSZLUOQYcSZLUOQYcSZLUOQYcSZLUOQYcSZLUOQYcSZLUOQYcSZLUOQYcSZLUOQYcSZLUOQYcSQAkuaj9luKpONevJdmW5IEkzx1h/wuSfKfd/+qpqGmyJbkgybsHXYc0U/ht4tI0k+QO4KnAw8DPga8Ab62qbZN83ouA7VX1x320LWB5VW09yHN9F/i9qvrcKPuvAa6sqg8fzPElyREcaXpaU1WHAU8H7gb+YsD1TLRjgC0Hsz8Nf3dJ2i9/SUjTWFX9E3AFsGJoW5LDk1yS5J4kdyb546E/+Ek+muSKnrYfSHJNGwpOTrI9ybuS/DDJHUleN9q5k7wlydYk9ya5Mskz2u3XtU2+2U4h/eYIz53V1nVnkp1tvYcnmZfkAWB2+/zvjvDc7wLPBK5qjz8vyZeSvD/Jl4GfAs9sj/eJJDuS/CDJ+5LMbo8xO8mfta/z9iRvS1JJ5rT770jysp5zvifJpT2PT0rylSS7knwzyck9+76U5E+TfDnJT5J8Icn8nv0v7HnutiRvarfvMwWY5FVJbm7bfSXJL/fs+8P2Nf0kyW1JXjrafydJIzPgSNNYkicAvwl8tWfzXwCH04SAFwNvAN7c7nsn8MtJ3pTkXwKnA2+sR+ainwbMB44G3ghcmOTYEc77EuA/A6+lGUW6E7gcoKpe1DZ7TlUdVlWfHqH0N7U/v9rWeRjwkap6qB2ZGnr+s4Y/sd32fdpRrKp6qN31W8AZwBPbei4G9gDLgOcCLwd+u237FuBV7fZVwGtGqHFESY4GPg+8DzgS+H3gM0kW9DT7NzR9/hRgbtuGJIuBv6X5b7QAOB64eYRzPA9YD/wOcBTwMeDKNswdC5wF/POqeiJwCnBHv/VLahhwpOnpr5PsAn4M/Cvgv0AzMkETeM6pqp9U1R3AeTR//KmqnwKvBz4IXAq8vaq2Dzv2u9ug8b9o/pC/doTzvw5YX1XfaAPGOcCvJFnSZ/2vAz5YVbdX1QPt808bGkE5SBdV1Zaq2kMTPE4F3lFVD1bVTuBDwGlt29cC/7WqtlXVvTRhrV+vBzZV1aaq+kVVfRHYDLyip81/r6r/V1U/AzbSBBloXvffVdWGqvp5Vf2oqh4VcGgC2Meq6mtV9XBVXQw8BJxEs/ZqHrAiySFVdUdVPWqkS9L+GXCk6enVVXUEzR+6s4D/lWRo9GUuzQjGkDtpRmQAqKqvA7cDofnj2+u+qnpw2HOfMcL5n9F7jjak/Kj3PGPY5/nt/Tk0i6cPVu8i62OAQ4Ad7RTPLppRkKf0nL+3fW8tYzkG+I2h47bHfiHNSNaQf+y5/1OaESqARUA/YeQY4J3DzrEIeEa7cPsdwHuAnUkuH5oelNQ/A440jbX/d/9Zmv+rfyHwQ5p3Vh3T02wx8IOhB0neRhOM7gL+YNghn5zk0GHPvWuEU9/Ve472OUf1nmcM+zy/Pc8emgXTB6v3LZ/baEY85lfVEe3Pk6pqZbt/B01g6D1/rweBJ/Q8ftqwY3+y57hHVNWhVXVuHzVuAx417TZKu/cPO8cTqmoDQFVdVlUvpOnDAj7QxzEl9TDgSNNYuzh4LfBk4NaqephmVOb9SZ6Y5Bjg92imo0jySzRrR15PM231B0mOH3bY9yaZ267ReRXwP0Y49WXAm5Mcn2Qe8J+Ar7VTYtAElWfup/QNwO8mWZrksPb5n26nl8atqnYAXwDOS/KkdlHzs5K8uG2yEfh3SRYmeTJw9rBD3EwzZXZIkuFrdC4F1iQ5pV2s/Lg0C7QX9lHap4CXJXltkjlJjhqh/wE+Drw1yfPb/8aHJnll+9/02CQvafv9
n4Cf0QRcSQfAgCNNT1elebfRj4H30ywUHnrb9NtpRiBuB66nCSPr2/UtlwIfqKpvVtV3gHcBn2z/WEIztXIfzQjLp2g+X+cfhp+8qq4B3g18hmY05Fk8sr4FmumTi9vplZHW8KwHPglcB3yP5g/12w+mI/bjDTTTdd+meU1X8Mg00seBq4FvAt8APjvsue+meU33Ae+l6UMA2s8bWkvTd/fQjLb8B/r4fVlV36dZq/NO4F6aIPWcEdptplmH85G2hq00i7KhGX07l2a07h9ppt3eNda5Je3LD/qTZoj2rc6XVlU/IxGd0i6O/h5wyESNIkma3hzBkSRJnWPAkSRJneMUlSRJ6hxHcCRJUueM51NFJ8z8+fNryZIlgy5DkiRNczfddNMPq2rBWO2mRcBZsmQJmzdvHnQZkiRpmkvS1yeTO0UlSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6x4AjSZI6Z2YEnPccPugKJEnSFJoZAUeSJM0oBhxJktQ5BhxJktQ5BhxJktQ5BhxJktQ5BhxJktQ5BhxJktQ5BhxJktQ5kxJwkqxPsjPJLZNx/AOsZe/tY+3nqKOOYsOGDWzYsIFnP/vZzJo1i8c97nEkYdasWSThkEMO4ahkAG5VAAAKAElEQVSjjmL27Nk8+9nPZsOGDXtf+/DnzZo161Fteg21H+lY/eo9xqJFi1i0aNG4jjeRtU0XXXotkjRk2v1uq6oJ/wFeBDwPuKWf9ieccEJNBqCAqj950t770+3nSU/at7a5c+cWUMuXL6/Zs2fX4x//+FqwYEH90R/9US1durRe8IIXFFAvfelLa/78+XXooYfW7Nmz68wzz6xrr722li5dWpdddllddtlltXTp0r3PO++882rJkiV7H1922WX79NVQ+2uvvbZ27969z7H61XuMSy65pJ72tKfV05/+9LrkkksO6ngTWdt00aXXIklDpvJ3G7C5+ski/TQ6mB9giQHnkZ8k+zw+7LDD9t5fuXLl3vvXXnttnXnmmZWkzjvvvAJqyZIltXLlyrr22mtr3rx5deaZZ9bKlStryZIltWTJkjrvvPNq3rx5VVV17bXX1sqVK/e2H7rt3Td026u33ZCR2u1P7zGG7vce40CPN5G1TRddei2SNGQqf7dN+4ADnAFsBjYvXrx4wjug7YRJCzinnHLK3vtvfetb996/9NJL+3r+TTfdtPf+nXfeuff+7t27a9euXQXUgw8+WEDNmjWrZs2aVbt37y6gdu3aVbNmzaokNWvWrL3tqqp27969T/uh2959Q7e9etsNGand/vQeY+h+7zEO9HgTWdt00aXXIklDpvJ3W78BZ2CLjKvqwqpaVVWrFixYMKgyDtrVV1+99/4FF1yw9/7rX//6EdsPrQUa8uIXv3jv/Ve84hV7719//fWcc845JNl73MWLF3Pcccdx/fXXM2/ePM455xyOO+44jjnmGBYvXswFF1zAvHnz9j7/uOOO29t+6LZ3
39Btr952vbUMb7c/vccYut97jAM93kTWNl106bVI0pBp+butnxR0MD84RdXXj2twDuy4j/V1K116LZI0xDU4Uxxw2o6Y1gFnfz9HHnnk3rCycuXKSlLz5s0reGRNz5w5c+rII4+sWbNm1cqVK/e5mIY/L8mj2vQaaj/SsfrVe4yFCxfWwoULx3W8iaxtuujSa5GkIVP1u63fgJOm7cRKsgE4GZgP3A38SVV9YrT2q1atqs2bN094HXu953B4z/2Td3xJkjQlktxUVavGajdnMk5eVesm47iSJEn98JOMJUlS5xhwJElS5xhwJElS5xhwJElS5xhwJElS5xhwJElS5xhwJElS5xhwJElS58yMgOOnGEuSNKPMjIAjSZJmFAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqHAOOJEnqnFTVoGsgyT3AnZN4ivnADyfx+F1gH43NPhqbfTQ2+2hs9tHYZnIfHVNVC8ZqNC0CzmRLsrmqVg26junMPhqbfTQ2+2hs9tHY7KOx2Udjc4pKkiR1jgFHkiR1zkwJOBcOuoDHAPtobPbR2OyjsdlHY7OPxmYfjWFGrMGRJEkzy0wZwZEkSTOIAUeSJHVOpwNOktVJbkuyNcnZg65nukhyR5JvJbk5yeZ225FJvpjkO+3tkwdd51RLsj7JziS39GwbsV/S+PP22vq/SZ43uMqnxij9854kP2ivpZuTvKJn3zlt/9yW5JTBVD21kixK8vdJbk2yJcm/b7d7HbX200deS60kj0vy9STfbPvove32pUm+1l5Hn04yt90+r328td2/ZJD1TxedDThJZgPnA6cCK4B1SVYMtqpp5Ver6viez1E4G7imqpYD17SPZ5qLgNXDto3WL6cCy9ufM4CPTlGNg3QRj+4fgA+119LxVbUJoP23dhqwsn3OX7b/JrtuD/DOqjoOOAl4W9sXXkePGK2PwGtpyEPAS6rqOcDxwOokJwEfoOmj5cB9wOlt+9OB+6pqGfChtt2M19mAA5wIbK2q26tqN3A5sHbANU1na4GL2/sXA68eYC0DUVXXAfcO2zxav6wFLqnGV4Ejkjx9aiodjFH6ZzRrgcur6qGq+h6wlebfZKdV1Y6q+kZ7/yfArcDReB3ttZ8+Gs2Mu5ba6+GB9uEh7U8BLwGuaLcPv46Grq8rgJcmyRSVO211OeAcDWzrebyd/f8jmkkK+EKSm5Kc0W57alXtgOYXEPCUgVU3vYzWL15fjzirnV5Z3zO1OeP7p50meC7wNbyORjSsj8Braa8ks5PcDOwEvgh8F9hVVXvaJr39sLeP2v33A0dNbcXTT5cDzkjp1ffEN15QVc+jGR5/W5IXDbqgxyCvr8ZHgWfRDKPvAM5rt8/o/klyGPAZ4B1V9eP9NR1h24zopxH6yGupR1U9XFXHAwtpRqyOG6lZezsj+2gsXQ4424FFPY8XAncNqJZpparuam93An9F84/n7qGh8fZ25+AqnFZG6xevL6Cq7m5/
Ef8C+DiPTB3M2P5JcgjNH+5PVdVn281eRz1G6iOvpZFV1S7gSzTrlY5IMqfd1dsPe/uo3X84/U8nd1aXA86NwPJ21flcmkVqVw64poFLcmiSJw7dB14O3ELTN29sm70R+NxgKpx2RuuXK4E3tO+COQm4f2gKYiYZtl7k12iuJWj657T23R1LaRbRfn2q65tq7bqHTwC3VtUHe3Z5HbVG6yOvpUckWZDkiPb+44GX0axV+nvgNW2z4dfR0PX1GuDa8lN8mTN2k8emqtqT5CzgamA2sL6qtgy4rOngqcBftevP5gCXVdX/THIjsDHJ6cD3gd8YYI0DkWQDcDIwP8l24E+Acxm5XzYBr6BZ8PhT4M1TXvAUG6V/Tk5yPM1w+B3A7wBU1ZYkG4Fv07xr5m1V9fAg6p5iLwB+C/hWu34C4F14HfUarY/WeS3t9XTg4vbdYrOAjVX1N0m+DVye5H3A/6EJirS3n0yylWbk5rRBFD3d+FUNkiSpc7o8RSVJkmYoA44kSeocA44kSeocA44kSeocA44kSeocA44kSeocA44kSeqc/w8RanZpadw05QAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 576x432 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "X = df[features] * 1.0\n",
    "a = X.values.flatten()\n",
    "\n",
    "fig, _ = plt.subplots(2, 1, figsize = (8, 6))\n",
    "axes = fig.axes \n",
    "\n",
    "axes[0].hist(X.values.flatten(), bins = 50, log = True);\n",
    "axes[0].set_title(\"Histogram of frequencies\")\n",
    "axes[1].boxplot(a, vert = False);\n",
    "axes[1].set_title(\"Boxplot of frequencies\")\n",
    "plt.tight_layout()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1080000,)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many individual word counts (across all profiles and features) are greater than 20?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(52,)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a[a>20].shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cap the count values at 50: any value above 50 is replaced by 50."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEAVJREFUeJzt3X+MnVldx/H3h8KAWXQRFoxpO7ZkmobGKGSbLgmarITg1N2hhKh01ARMsw3EGowaKcbEoCGBf1SQmk2FpvwBrc3Kj66MqQTZFJMG2wUMLXVDbVY7tqGLC1XUsBa+/jF3ZTLOTO+de+/c3tP3K9l0njP3Oc/37N797tnvc57zpKqQJLXrOaMOQJI0XCZ6SWqciV6SGmeil6TGmeglqXEmeklqnIlekhpnopekxpnoJalxzx11AAD33HNPbdmyZdRhSNJYefzxx79RVS+91edui0S/ZcsWzp07N+owJGmsJPnnbj5n6UaSGmeil6TGmeglqXEmeklq3MBvxiZ5DvCHwA8B56rqI4O+hiSpe13N6JMcSXI9yfkl7dNJnkhyKcnBTvMeYCPwP8D8YMOVJPWq29LNUWB6cUOSDcAhYDewA5hNsgPYDpypqt8E3j64UCVJa9FVoq+q08DTS5p3AZeq6nJVPQMcZ2E2Pw98s/OZ767UZ5L9Sc4lOffUU0/1HrkkqSv91Og3AlcWHc8D9wHvB/40yU8Dp1c6uaoOA4cBdu7cueYX1245+Oll25987wNr7VKSmtJPos8ybVVV/wXs66qDZAaYmZqa6iMMSdJq+lleOQ9sXnS8CbjaSwdV9WhV7b/77rv7CEOStJp+Ev1ZYFuSrUkmgL3AyV46SDKT5PCNGzf6CEOStJpul1ceA84A25PMJ9lXVTeBA8Ap4CJwoqou9HJxZ/SSNHxd1eiranaF9jlgbq0Xt0YvScM30i0QnNFL0vCNNNFbo5ek4XNGL0mNc0YvSY1zRi9JjXM/eklqnIlekhpnjV6SGmeNXpIaZ+lGkhpnopekxlmjl6TGWaOXpMZZupGkxpnoJalxJnpJapyJXpIa56obSWqcq24kqXGWbiSpcSZ6SWqciV6SGmeil6TGDTzRJ7k/yeeTPJzk/kH3L0nqTVeJPsmRJNeTnF/SPp3kiSSXkhzsNBfwbeAFwPxgw5Uk9arbGf1RYHpxQ5INwCFgN7ADmE2yA/h8Ve0G3gm8e3ChSpLWoqtEX1WngaeXNO8CLlXV5ap6BjgO7Kmq73V+/03g+QOLVJK0Js/t49yNwJVFx/PAfUneBPws8CLggyudnGQ/sB9gcnKyjzAkSavpJ9Fnmbaqqo8DH7/VyVV1OMk1YGZiYuLePuJY1paDn162/cn3PjDoS0nSba2fVTfzwOZFx5uAq7104BYIkjR8/ST6s8C2JFuTTAB7gZO9dOCmZpI0fN0urzwGnAG2J5lPsq+qbgIHgFPAReBEVV3o5eLO6CVp+Lqq0VfV7Artc8DcWi+eZAaYmZqaWmsXkqRbcJtiSWqcLx6RpMY5o5ekxrl7pSQ1ztKNJDXO0o0kNc7SjSQ1ztKNJDXO0o0kNc7SjSQ1zkQvSY2zRi9JjbNGL0mNs3QjSY3r51WCY2mlVwyCrxmU1CZn9JLUOBO9JDXOVTeS1DhX3UhS4yzdSFLjTPSS1DgTvSQ1zkQvSY0bSqJPcleSx5M8OIz+JUnd6yrRJzmS5HqS80vap5M8keRSkoOLfvVO4MQgA5UkrU23M/qjwPTihiQbgEPAbmAHMJtkR5LXAV8Fvj7AOCVJa9TVXjdVdTrJliXNu4BLVXUZIMlxYA/wQuAuFpL/fyeZq6rvDSxiSVJP+tnUbCNwZdHxPHBfVR0ASPJW4BsrJfkk+4H9AJOTk32EIUlaTT+JPsu01f/9UHV0tZOr6nCSa8DMxMTEvX3EIUlaRT+Jfh7YvOh4E3C1lw6q6lHg0Z07dz7U
RxwDs9IWxm5fLGmc9bO88iywLcnWJBPAXuBkLx24qZkkDV+3yyuPAWeA7Unmk+yrqpvAAeAUcBE4UVUXerm4m5pJ0vB1u+pmdoX2OWBurRdPMgPMTE1NrbULSdItuE2xJDXOF49IUuOc0UtS49y9UpIaZ+lGkhpn6UaSGmfpRpIa188WCH0bl3X0bo0gaZxZupGkxlm6kaTGmeglqXEur5Skxlmjl6TGWbqRpMaZ6CWpcSZ6SWqciV6SGueTsX3wiVlJ48BVN5LUOEs3ktQ4E70kNc5EL0mNM9FLUuMGnuiTvCLJw0keSfL2QfcvSepNV4k+yZEk15OcX9I+neSJJJeSHASoqotV9TbgF4Gdgw9ZktSLbmf0R4HpxQ1JNgCHgN3ADmA2yY7O794A/B3w2YFFKklak64emKqq00m2LGneBVyqqssASY4De4CvVtVJ4GSSTwMfG1y448EHqSTdTvp5MnYjcGXR8TxwX5L7gTcBzwfmVjo5yX5gP8Dk5GQfYUiSVtNPos8ybVVVjwGP3erkqjqc5BowMzExcW8fcUiSVtHPqpt5YPOi403A1V46cAsESRq+fhL9WWBbkq1JJoC9wMleOvBVgpI0fN0urzwGnAG2J5lPsq+qbgIHgFPAReBEVV3o5eLO6CVp+LpddTO7Qvscq9xwvZVx36ZYksaB2xRLUuN88cg6cn29pFFwRi9JjXP3Sklq3EgTvcsrJWn4LN1IUuMs3UhS4yzdSFLjLN1IUuMs3UhS40b6wJQW+CCVpGGyRi9JjbNGL0mNs0YvSY0z0UtS40z0ktQ4V93cxlyNI2kQXHUjSY1z1Y0kNc4avSQ1zkQvSY0z0UtS41x1M4ZcjSOpF0OZ0Sd5Y5I/T/KpJK8fxjUkSd3pOtEnOZLkepLzS9qnkzyR5FKSgwBV9cmqegh4K/DmgUYsSepJLzP6o8D04oYkG4BDwG5gBzCbZMeij/xe5/eSpBHpOtFX1Wng6SXNu4BLVXW5qp4BjgN7suB9wF9X1RcHF64kqVf93ozdCFxZdDwP3Af8OvA64O4kU1X18NITk+wH9gNMTk72GYbAm7SSltdvos8ybVVVHwA+sNqJVXU4yTVgZmJi4t4+45AkraDfVTfzwOZFx5uAq92e7BYIkjR8/Sb6s8C2JFuTTAB7gZPdnuymZpI0fL0srzwGnAG2J5lPsq+qbgIHgFPAReBEVV3otk9n9JI0fF3X6KtqdoX2OWBuLRdPMgPMTE1NreV0SVIX3KZYkho30r1unNGvD5ddSnc2Z/SS1Di3KZakxvnOWElqnKUbSWqcpRtJapylG0lq3EiXV1bVo8CjO3fufGiUcdypXHYp3Rks3UhS43w5uP6flWb64GxfGkfW6CWpcS6vlKTGWaOXpMaZ6CWpcSZ6SWqcq27UE9feS+PHVTeS1DhX3UhS4yzdaCB6LelYApLWjzdjJalxJnpJapyJXpIaN/BEn+TlST6c5JFB9y1J6l1XN2OTHAEeBK5X1Y8vap8G3g9sAD5UVe+tqsvAPhO9YPWdMCWtj25n9EeB6cUNSTYAh4DdwA5gNsmOgUYnSepbV4m+qk4DTy9p3gVcqqrLVfUMcBzYM+D4JEl96qdGvxG4suh4HtiY5CVJHgZeleRdK52cZH+Sc0nOPfXUU32EIUlaTT8PTGWZtqqqfwPedquTq+pwkmvAzMTExL19xCFJWkU/M/p5YPOi403A1V46cAsESRq+fmb0Z4FtSbYC/wrsBX6plw6SzAAzU1NTfYShO4FbLEhr19WMPskx4AywPcl8kn1VdRM4AJwCLgInqupCLxd3Ri9Jw9fVjL6qZldonwPm1npxZ/Rqkf83oduN2xRLUuNGuk2xM3ot1euTtD55q3Exyv/Tc0YvSY1z90pJapylG91RvFGqO5GlG0lqnKUbSWrcSBN9kpkkh2/cuDHKMCSpaZZuJKlxlm4kqXEmeklqnMsrJda27HJQT+W65FPDZo1ekhpn6UaSGmei
l6TGmeglqXEmeklqnKtupNvUuKzGWW310e0W653KVTeS1DhLN5LUOBO9JDXORC9JjTPRS1LjBr7qJsldwJ8BzwCPVdVHB30NSVL3uprRJzmS5HqS80vap5M8keRSkoOd5jcBj1TVQ8AbBhyvJKlH3ZZujgLTixuSbAAOAbuBHcBskh3AJuBK52PfHUyYkqS16irRV9Vp4OklzbuAS1V1uaqeAY4De4B5FpJ91/1Lkoannxr9Rr4/c4eFBH8f8AHgg0keAB5d6eQk+4H9AJOTk32EIQ3PoPacH6RBPTHbaz+D/Hsxqqd+ex1DK0/29pPos0xbVdV/Ar96q5Or6nCSa8DMxMTEvX3EIUlaRT+llXlg86LjTcDVXjpwCwRJGr5+Ev1ZYFuSrUkmgL3AyV46SDKT5PCNGzf6CEOStJpul1ceA84A25PMJ9lXVTeBA8Ap4CJwoqou9HJxZ/SSNHxd1eiranaF9jlgbq0Xd5tiSRo+tymWpMaNNNFbo5ek4XNGL0mN88lVSWpcqmp0F+/cjAXeDHxtjd3cA3xjYEGNB8d8Z3DMd4Z+xvxjVfXSW31opIl+EJKcq6qdo45jPTnmO4NjvjOsx5gt3UhS40z0ktS4FhL94VEHMAKO+c7gmO8MQx/z2NfoJUmra2FGL0laxVgn+hXeWduU5d7Xm+TFST6T5GudP394lDEOUpLNST6X5GKSC0ne0WlvecwvSPL3Sf6hM+Z3d9q3JvlCZ8x/0dkltilJNiT5UpK/6hw3PeYkTyb5SpIvJznXaRv6d3tsE/0q76xtzVGWvK8XOAh8tqq2AZ/tHLfiJvBbVfUK4NXAr3X+ubY85u8Ar62qnwReCUwneTXwPuCPO2P+JrBvhDEOyztY2P32WXfCmH+mql65aEnl0L/bY5voWfmdtU1Z4X29e4CPdH7+CPDGdQ1qiKrqWlV9sfPzf7CQBDbS9pirqr7dOXxe568CXgs80mlvaswASTYBDwAf6hyHxse8gqF/t8c50S/3ztqNI4plvf1IVV2DhcQIvGzE8QxFki3Aq4Av0PiYOyWMLwPXgc8A/wR8q/PeB2jz+/0nwO8A3+scv4T2x1zA3yR5vPPebFiH73Y/74wdtWXfWbvuUWgokrwQ+EvgN6rq3xcme+2qqu8Cr0zyIuATwCuW+9j6RjU8SR4ErlfV40nuf7Z5mY82M+aO11TV1SQvAz6T5B/X46LjPKPv+521Y+zrSX4UoPPn9RHHM1BJnsdCkv9oVX2809z0mJ9VVd8CHmPh/sSLkjw7GWvt+/0a4A1JnmSh7PpaFmb4LY+Zqrra+fM6C/9B38U6fLfHOdH3/c7aMXYSeEvn57cAnxphLAPVqdN+GLhYVX+06Fctj/mlnZk8SX4AeB0L9yY+B/x852NNjbmq3lVVm6pqCwv/7v5tVf0yDY85yV1JfvDZn4HXA+dZh+/2WD8wleTnWJgFbACOVNV7RhzSwHXe13s/CzvcfR34feCTwAlgEvgX4BeqaukN27GU5KeAzwNf4fu1299loU7f6ph/goWbcBtYmHydqKo/SPJyFma7Lwa+BPxKVX1ndJEOR6d089tV9WDLY+6M7ROdw+cCH6uq9yR5CUP+bo91opck3do4l24kSV0w0UtS40z0ktQ4E70kNc5EL0mNM9FLUuNM9JLUOBO9JDXufwFDwfy5yPtqfQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "X_clipped = np.clip(X.values, a_min=0, a_max=50)\n",
    "plt.hist(X_clipped.flatten(), log=True, bins = 50);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before applying KMeans, rescale each feature to the [0, 1] range with min-max scaling."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler = preprocessing.MinMaxScaler()\n",
    "X_std = scaler.fit_transform(X_clipped)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set the number of clusters to k=5."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "k = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n",
       "    n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',\n",
       "    random_state=1, tol=0.0001, verbose=0)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kmeans = cluster.KMeans(n_clusters=k, random_state=1)\n",
    "kmeans.fit(X_std)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Predict cluster for each point based on the KMeans model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = kmeans.predict(X_std)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Centroids of the clusters in the normal scale (using scaler.inverse_transform)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>basketball</th>\n",
       "      <td>2.185926e-01</td>\n",
       "      <td>0.449442</td>\n",
       "      <td>0.436441</td>\n",
       "      <td>0.412173</td>\n",
       "      <td>0.282163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>football</th>\n",
       "      <td>2.111973e-01</td>\n",
       "      <td>0.377216</td>\n",
       "      <td>0.549435</td>\n",
       "      <td>0.360555</td>\n",
       "      <td>0.273462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>soccer</th>\n",
       "      <td>1.931401e-01</td>\n",
       "      <td>0.367695</td>\n",
       "      <td>0.264124</td>\n",
       "      <td>0.276194</td>\n",
       "      <td>0.249845</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>softball</th>\n",
       "      <td>1.303934e-01</td>\n",
       "      <td>0.288903</td>\n",
       "      <td>0.238701</td>\n",
       "      <td>0.251541</td>\n",
       "      <td>0.161591</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>volleyball</th>\n",
       "      <td>1.103398e-01</td>\n",
       "      <td>0.342416</td>\n",
       "      <td>0.211864</td>\n",
       "      <td>0.181818</td>\n",
       "      <td>0.122436</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>swimming</th>\n",
       "      <td>1.041695e-01</td>\n",
       "      <td>0.267892</td>\n",
       "      <td>0.189266</td>\n",
       "      <td>0.202234</td>\n",
       "      <td>0.162213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cheerleading</th>\n",
       "      <td>3.017104e-02</td>\n",
       "      <td>0.085686</td>\n",
       "      <td>2.888418</td>\n",
       "      <td>0.058937</td>\n",
       "      <td>0.046613</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>baseball</th>\n",
       "      <td>9.568531e-02</td>\n",
       "      <td>0.119829</td>\n",
       "      <td>0.185028</td>\n",
       "      <td>0.143297</td>\n",
       "      <td>0.106277</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tennis</th>\n",
       "      <td>7.817250e-02</td>\n",
       "      <td>0.136901</td>\n",
       "      <td>0.076271</td>\n",
       "      <td>0.108629</td>\n",
       "      <td>0.089497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sports</th>\n",
       "      <td>1.191870e-01</td>\n",
       "      <td>0.175312</td>\n",
       "      <td>0.168079</td>\n",
       "      <td>0.223421</td>\n",
       "      <td>0.210690</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cute</th>\n",
       "      <td>2.137834e-01</td>\n",
       "      <td>0.800394</td>\n",
       "      <td>0.576271</td>\n",
       "      <td>0.556626</td>\n",
       "      <td>0.424487</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sex</th>\n",
       "      <td>1.215916e-01</td>\n",
       "      <td>0.266907</td>\n",
       "      <td>0.300847</td>\n",
       "      <td>0.447612</td>\n",
       "      <td>0.839030</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sexy</th>\n",
       "      <td>1.156935e-01</td>\n",
       "      <td>0.186146</td>\n",
       "      <td>0.203390</td>\n",
       "      <td>0.208783</td>\n",
       "      <td>0.269111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hot</th>\n",
       "      <td>8.343542e-02</td>\n",
       "      <td>0.314183</td>\n",
       "      <td>0.251412</td>\n",
       "      <td>0.211479</td>\n",
       "      <td>0.170914</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>kissed</th>\n",
       "      <td>4.645887e-02</td>\n",
       "      <td>0.136573</td>\n",
       "      <td>0.189266</td>\n",
       "      <td>0.325886</td>\n",
       "      <td>0.420137</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dance</th>\n",
       "      <td>3.249399e-01</td>\n",
       "      <td>0.832239</td>\n",
       "      <td>0.668079</td>\n",
       "      <td>0.661017</td>\n",
       "      <td>0.540087</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>band</th>\n",
       "      <td>2.689079e-01</td>\n",
       "      <td>0.338805</td>\n",
       "      <td>0.261299</td>\n",
       "      <td>0.433359</td>\n",
       "      <td>0.436917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>marching</th>\n",
       "      <td>3.978948e-02</td>\n",
       "      <td>0.040053</td>\n",
       "      <td>0.024011</td>\n",
       "      <td>0.053929</td>\n",
       "      <td>0.038533</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>music</th>\n",
       "      <td>6.345901e-01</td>\n",
       "      <td>0.984570</td>\n",
       "      <td>0.748588</td>\n",
       "      <td>1.137519</td>\n",
       "      <td>1.026725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rock</th>\n",
       "      <td>1.959076e-01</td>\n",
       "      <td>0.345043</td>\n",
       "      <td>0.360169</td>\n",
       "      <td>0.389445</td>\n",
       "      <td>0.413300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>god</th>\n",
       "      <td>4.086475e-01</td>\n",
       "      <td>0.561064</td>\n",
       "      <td>0.608757</td>\n",
       "      <td>0.610555</td>\n",
       "      <td>0.743319</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>church</th>\n",
       "      <td>1.971780e-01</td>\n",
       "      <td>0.468155</td>\n",
       "      <td>0.365819</td>\n",
       "      <td>0.379815</td>\n",
       "      <td>0.266004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>jesus</th>\n",
       "      <td>1.033075e-01</td>\n",
       "      <td>0.149048</td>\n",
       "      <td>0.115819</td>\n",
       "      <td>0.135978</td>\n",
       "      <td>0.121815</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bible</th>\n",
       "      <td>1.851096e-02</td>\n",
       "      <td>0.021668</td>\n",
       "      <td>0.026836</td>\n",
       "      <td>0.036210</td>\n",
       "      <td>0.032940</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hair</th>\n",
       "      <td>2.422758e-01</td>\n",
       "      <td>0.754760</td>\n",
       "      <td>0.661017</td>\n",
       "      <td>1.065485</td>\n",
       "      <td>1.121193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dress</th>\n",
       "      <td>5.784674e-02</td>\n",
       "      <td>0.387722</td>\n",
       "      <td>0.146893</td>\n",
       "      <td>0.194530</td>\n",
       "      <td>0.164077</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>blonde</th>\n",
       "      <td>5.389955e-02</td>\n",
       "      <td>0.166776</td>\n",
       "      <td>0.289548</td>\n",
       "      <td>0.173344</td>\n",
       "      <td>0.211311</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mall</th>\n",
       "      <td>1.259471e-01</td>\n",
       "      <td>1.013132</td>\n",
       "      <td>0.418079</td>\n",
       "      <td>0.397535</td>\n",
       "      <td>0.330019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>shopping</th>\n",
       "      <td>1.407831e-01</td>\n",
       "      <td>1.778070</td>\n",
       "      <td>0.707627</td>\n",
       "      <td>0.439137</td>\n",
       "      <td>0.267247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>clothes</th>\n",
       "      <td>7.291390e-14</td>\n",
       "      <td>0.174327</td>\n",
       "      <td>0.217514</td>\n",
       "      <td>1.355162</td>\n",
       "      <td>0.156619</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hollister</th>\n",
       "      <td>3.039789e-02</td>\n",
       "      <td>0.253775</td>\n",
       "      <td>0.211864</td>\n",
       "      <td>0.151772</td>\n",
       "      <td>0.067744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>abercrombie</th>\n",
       "      <td>2.146001e-02</td>\n",
       "      <td>0.192055</td>\n",
       "      <td>0.158192</td>\n",
       "      <td>0.100924</td>\n",
       "      <td>0.064015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>die</th>\n",
       "      <td>1.455923e-01</td>\n",
       "      <td>0.223572</td>\n",
       "      <td>0.190678</td>\n",
       "      <td>0.287365</td>\n",
       "      <td>0.467371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>death</th>\n",
       "      <td>9.305385e-02</td>\n",
       "      <td>0.155942</td>\n",
       "      <td>0.127119</td>\n",
       "      <td>0.167180</td>\n",
       "      <td>0.234307</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>drunk</th>\n",
       "      <td>-3.529121e-14</td>\n",
       "      <td>0.048260</td>\n",
       "      <td>0.086158</td>\n",
       "      <td>0.050847</td>\n",
       "      <td>1.428838</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>drugs</th>\n",
       "      <td>3.026178e-02</td>\n",
       "      <td>0.073867</td>\n",
       "      <td>0.081921</td>\n",
       "      <td>0.124037</td>\n",
       "      <td>0.336234</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         0         1         2         3         4\n",
       "basketball    2.185926e-01  0.449442  0.436441  0.412173  0.282163\n",
       "football      2.111973e-01  0.377216  0.549435  0.360555  0.273462\n",
       "soccer        1.931401e-01  0.367695  0.264124  0.276194  0.249845\n",
       "softball      1.303934e-01  0.288903  0.238701  0.251541  0.161591\n",
       "volleyball    1.103398e-01  0.342416  0.211864  0.181818  0.122436\n",
       "swimming      1.041695e-01  0.267892  0.189266  0.202234  0.162213\n",
       "cheerleading  3.017104e-02  0.085686  2.888418  0.058937  0.046613\n",
       "baseball      9.568531e-02  0.119829  0.185028  0.143297  0.106277\n",
       "tennis        7.817250e-02  0.136901  0.076271  0.108629  0.089497\n",
       "sports        1.191870e-01  0.175312  0.168079  0.223421  0.210690\n",
       "cute          2.137834e-01  0.800394  0.576271  0.556626  0.424487\n",
       "sex           1.215916e-01  0.266907  0.300847  0.447612  0.839030\n",
       "sexy          1.156935e-01  0.186146  0.203390  0.208783  0.269111\n",
       "hot           8.343542e-02  0.314183  0.251412  0.211479  0.170914\n",
       "kissed        4.645887e-02  0.136573  0.189266  0.325886  0.420137\n",
       "dance         3.249399e-01  0.832239  0.668079  0.661017  0.540087\n",
       "band          2.689079e-01  0.338805  0.261299  0.433359  0.436917\n",
       "marching      3.978948e-02  0.040053  0.024011  0.053929  0.038533\n",
       "music         6.345901e-01  0.984570  0.748588  1.137519  1.026725\n",
       "rock          1.959076e-01  0.345043  0.360169  0.389445  0.413300\n",
       "god           4.086475e-01  0.561064  0.608757  0.610555  0.743319\n",
       "church        1.971780e-01  0.468155  0.365819  0.379815  0.266004\n",
       "jesus         1.033075e-01  0.149048  0.115819  0.135978  0.121815\n",
       "bible         1.851096e-02  0.021668  0.026836  0.036210  0.032940\n",
       "hair          2.422758e-01  0.754760  0.661017  1.065485  1.121193\n",
       "dress         5.784674e-02  0.387722  0.146893  0.194530  0.164077\n",
       "blonde        5.389955e-02  0.166776  0.289548  0.173344  0.211311\n",
       "mall          1.259471e-01  1.013132  0.418079  0.397535  0.330019\n",
       "shopping      1.407831e-01  1.778070  0.707627  0.439137  0.267247\n",
       "clothes       7.291390e-14  0.174327  0.217514  1.355162  0.156619\n",
       "hollister     3.039789e-02  0.253775  0.211864  0.151772  0.067744\n",
       "abercrombie   2.146001e-02  0.192055  0.158192  0.100924  0.064015\n",
       "die           1.455923e-01  0.223572  0.190678  0.287365  0.467371\n",
       "death         9.305385e-02  0.155942  0.127119  0.167180  0.234307\n",
       "drunk        -3.529121e-14  0.048260  0.086158  0.050847  1.428838\n",
       "drugs         3.026178e-02  0.073867  0.081921  0.124037  0.336234"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "centroids = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=features)\n",
    "centroids.T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For first cluster, find top 10 the most dominant features based on the magnitude."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "music         0.634590\n",
       "god           0.408648\n",
       "dance         0.324940\n",
       "band          0.268908\n",
       "hair          0.242276\n",
       "basketball    0.218593\n",
       "cute          0.213783\n",
       "football      0.211197\n",
       "church        0.197178\n",
       "rock          0.195908\n",
       "Name: 0, dtype: float64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "centroids.iloc[0, :].T.sort_values(ascending = False)[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For first cluster, music, god, dance, hair etc. are dominant features. Let's see the dominant features in the other clusters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "shopping      1.778070\n",
       "mall          1.013132\n",
       "music         0.984570\n",
       "dance         0.832239\n",
       "cute          0.800394\n",
       "hair          0.754760\n",
       "god           0.561064\n",
       "church        0.468155\n",
       "basketball    0.449442\n",
       "dress         0.387722\n",
       "Name: 1, dtype: float64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "centroids.iloc[1, :].T.sort_values(ascending = False)[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "cheerleading    2.888418\n",
       "music           0.748588\n",
       "shopping        0.707627\n",
       "dance           0.668079\n",
       "hair            0.661017\n",
       "god             0.608757\n",
       "cute            0.576271\n",
       "football        0.549435\n",
       "basketball      0.436441\n",
       "mall            0.418079\n",
       "Name: 2, dtype: float64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "centroids.iloc[2, :].T.sort_values(ascending = False)[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "clothes       1.355162\n",
       "music         1.137519\n",
       "hair          1.065485\n",
       "dance         0.661017\n",
       "god           0.610555\n",
       "cute          0.556626\n",
       "sex           0.447612\n",
       "shopping      0.439137\n",
       "band          0.433359\n",
       "basketball    0.412173\n",
       "Name: 3, dtype: float64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "centroids.iloc[3, :].T.sort_values(ascending = False)[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "drunk     1.428838\n",
       "hair      1.121193\n",
       "music     1.026725\n",
       "sex       0.839030\n",
       "god       0.743319\n",
       "dance     0.540087\n",
       "die       0.467371\n",
       "band      0.436917\n",
       "cute      0.424487\n",
       "kissed    0.420137\n",
       "Name: 4, dtype: float64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "centroids.iloc[4, :].T.sort_values(ascending = False)[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As music and god are common in top 10 for each cluster, we can drop these features and retry the clustering. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the density of each cluster. Calculate the avg distance of a point and its closes centroid."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gradyear</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>friends</th>\n",
       "      <th>basketball</th>\n",
       "      <th>football</th>\n",
       "      <th>soccer</th>\n",
       "      <th>softball</th>\n",
       "      <th>volleyball</th>\n",
       "      <th>swimming</th>\n",
       "      <th>...</th>\n",
       "      <th>shopping</th>\n",
       "      <th>clothes</th>\n",
       "      <th>hollister</th>\n",
       "      <th>abercrombie</th>\n",
       "      <th>die</th>\n",
       "      <th>death</th>\n",
       "      <th>drunk</th>\n",
       "      <th>drugs</th>\n",
       "      <th>cluster</th>\n",
       "      <th>distance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>16893</th>\n",
       "      <td>2008</td>\n",
       "      <td>F</td>\n",
       "      <td>16.364</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.046323</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18432</th>\n",
       "      <td>2008</td>\n",
       "      <td>F</td>\n",
       "      <td>16.690</td>\n",
       "      <td>23</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0.305214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2962</th>\n",
       "      <td>2006</td>\n",
       "      <td>M</td>\n",
       "      <td>17.960</td>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.043324</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6076</th>\n",
       "      <td>2006</td>\n",
       "      <td>F</td>\n",
       "      <td>18.319</td>\n",
       "      <td>54</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.109869</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14262</th>\n",
       "      <td>2007</td>\n",
       "      <td>M</td>\n",
       "      <td>18.215</td>\n",
       "      <td>14</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0.107005</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16646</th>\n",
       "      <td>2008</td>\n",
       "      <td>M</td>\n",
       "      <td>16.304</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.123572</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26571</th>\n",
       "      <td>2009</td>\n",
       "      <td>F</td>\n",
       "      <td>16.085</td>\n",
       "      <td>30</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.082439</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20107</th>\n",
       "      <td>2008</td>\n",
       "      <td>F</td>\n",
       "      <td>16.528</td>\n",
       "      <td>119</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>0.294885</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13933</th>\n",
       "      <td>2007</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.233036</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12905</th>\n",
       "      <td>2007</td>\n",
       "      <td>M</td>\n",
       "      <td>17.695</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.087391</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10 rows × 42 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       gradyear gender     age  friends  basketball  football  soccer  \\\n",
       "16893      2008      F  16.364        0           0         0       0   \n",
       "18432      2008      F  16.690       23           0         0       0   \n",
       "2962       2006      M  17.960       51           0         0       0   \n",
       "6076       2006      F  18.319       54           0         0       0   \n",
       "14262      2007      M  18.215       14           0         0       0   \n",
       "16646      2008      M  16.304        0           0         0       0   \n",
       "26571      2009      F  16.085       30           0         0       0   \n",
       "20107      2008      F  16.528      119           1         0       0   \n",
       "13933      2007    NaN     NaN        8           0         0       0   \n",
       "12905      2007      M  17.695       16           0         0       1   \n",
       "\n",
       "       softball  volleyball  swimming    ...     shopping  clothes  hollister  \\\n",
       "16893         0           0         0    ...            0        0          0   \n",
       "18432         1           0         0    ...            1        0          0   \n",
       "2962          0           0         0    ...            0        0          0   \n",
       "6076          0           0         1    ...            0        0          0   \n",
       "14262         0           0         0    ...            0        0          0   \n",
       "16646         0           0         0    ...            1        0          0   \n",
       "26571         0           0         0    ...            0        0          0   \n",
       "20107         1           0         0    ...            0        0          0   \n",
       "13933         0           0         1    ...            1        0          0   \n",
       "12905         0           0         0    ...            0        0          0   \n",
       "\n",
       "       abercrombie  die  death  drunk  drugs  cluster  distance  \n",
       "16893            0    0      0      0      0        0  0.046323  \n",
       "18432            0    0      0      1      0        4  0.305214  \n",
       "2962             0    0      0      0      0        0  0.043324  \n",
       "6076             0    0      0      0      0        0  0.109869  \n",
       "14262            0    0      0      1      0        4  0.107005  \n",
       "16646            0    0      1      0      0        0  0.123572  \n",
       "26571            0    0      0      0      0        0  0.082439  \n",
       "20107            0    1      0      2      1        4  0.294885  \n",
       "13933            0    0      0      0      0        1  0.233036  \n",
       "12905            0    0      0      0      0        0  0.087391  \n",
       "\n",
       "[10 rows x 42 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"cluster\"] = y_pred\n",
    "\n",
    "distances = np.zeros(len(y_pred))\n",
    "for i in range(k):\n",
    "    center = kmeans.cluster_centers_[i]\n",
    "    distances[y_pred == i] = metrics.euclidean_distances(X_std[y_pred == i]\n",
    "                                                        , center.reshape(1, -1)).squeeze()\n",
    "df[\"distance\"] = distances\n",
    "df.sample(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>gender</th>\n",
       "      <th>F</th>\n",
       "      <th>M</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cluster</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.131036</td>\n",
       "      <td>0.126245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.281103</td>\n",
       "      <td>0.333438</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.260543</td>\n",
       "      <td>0.266780</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.231254</td>\n",
       "      <td>0.225874</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.232222</td>\n",
       "      <td>0.205003</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "gender          F         M\n",
       "cluster                    \n",
       "0        0.131036  0.126245\n",
       "1        0.281103  0.333438\n",
       "2        0.260543  0.266780\n",
       "3        0.231254  0.225874\n",
       "4        0.232222  0.205003"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Average of the distance column for every (cluster, gender) pair.\n",
    "df.pivot_table(values=\"distance\", index=\"cluster\", columns=\"gender\", aggfunc=\"mean\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's find the anomalous profiles based on the distance of each profile from its centroid. Using the box-and-whisker method, identify the outliers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_outliers(a, whisker=1.5):\n",
    "    \"\"\"Flag values that fall outside the box-plot (Tukey) fences.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    a : numeric NumPy array or pandas Series\n",
    "        Values to screen.\n",
    "    whisker : float, default 1.5\n",
    "        Multiple of the inter-quartile range (IQR) that places the fences\n",
    "        beyond the first and third quartiles.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    Boolean mask shaped like `a`; True where a value is strictly below\n",
    "    Q1 - whisker * IQR or strictly above Q3 + whisker * IQR.\n",
    "    \"\"\"\n",
    "    q1, q3 = np.percentile(a, [25, 75])\n",
    "    iqr = q3 - q1\n",
    "    # Clamping the fences to min(a) / max(a) would not change the mask,\n",
    "    # because no value can lie beyond the data's own extremes.\n",
    "    lower_fence = q1 - whisker * iqr\n",
    "    upper_fence = q3 + whisker * iqr\n",
    "    return (a < lower_fence) | (a > upper_fence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1543, 42), (30000, 42))"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep the rows that find_outliers flags on the distance column.\n",
    "is_anomaly = find_outliers(df[\"distance\"])\n",
    "anamolies = df.loc[is_anomaly]\n",
    "anamolies.shape, df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x1a27bde860>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAD8CAYAAABthzNFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAE9dJREFUeJzt3X+sZ3V95/Hny1FEqEXsYGqAcWCH0I5NG+kt7tpftnXTQTqg9hcTN6nulClt6W7TZCMWo6ZJUzfp1pZI444uodouFHFroI6hWLWkVYTBqoCITkdaJtMUlO5QfxQKffeP77nw9fbcuefO3M/9fs/M85HccL6f8+P75syZed3P+ZwfqSokSVrqGbMuQJI0nwwISVIvA0KS1MuAkCT1MiAkSb0MCElSLwNCktTLgJAk9TIgJEm9njnrAo7Gxo0ba/PmzbMuQ5JG5a677vpyVZ220nJzFRBJTgZuA95SVX+60vKbN29m79697QuTpGNIkr8dslzTU0xJrknyUJJ7lrRvS3J/kn1Jrpia9QbghpY1SZKGaT0GcS2wbbohyQbgauACYCuwI8nWJK8APgf8Q+OaJEkDND3FVFW3Jdm8pPl8YF9V7QdIcj1wMfAtwMlMQuMbSfZU1b+2rE+StLxZjEGcDjw49fkA8NKquhwgyeuALy8XDkl2AbsANm3a1LZSSTqOzeIy1/S0PfVSiqq69nAD1FW1u6oWqmrhtNNWHISXJB2hWQTEAeDMqc9nAAdXs4Ek25PsPnTo0JoWJkl62iwC4k7gnCRnJTkBuAS4aTUbqKqbq2rXKaec0qRASVL7y1yvAz4BnJvkQJKdVfUEcDlwC3AfcENV3duyDknS6rW+imnHMu17gD1Hut0k24HtW7ZsOdJNsPmKD/a2P/C2C494m5J0LBnls5g8xSRJ7Y0yICRJ7Y0yILyKSZLaG2VAeIpJktobZUBIktqbq8d9D7UWVzEtx6ubJGlilD0ITzFJUnujDAhJUnsGhCSpl2MQAy03NgGOT0g6No2yB+EYhCS1N8qAkCS1Z0BIknoZEJKkXqMMCJ/FJEntjTIgHKSWpPZGGRCSpPZGeR/EvPH5TZKORfYgJEm9DAhJUq9RBoRXMUlSe6MMCK9ikqT2RhkQkqT2DAhJUi8DQpLUy4CQJPXyRrmGvIFO0pjZg5Ak9TIgJEm9RhkQ3ignSe2NMiC8UU6S2htlQEiS2jMgJEm9DAhJUi8DQpLUyxvlZsAb6CSNgT0ISVIvA0KS1MuAkCT1MiAkSb0MCElSr7kJiCTfmeSdSW5M8ouzrkeSjndNAyLJNUkeSnLPkvZtSe5Psi/JFQBVdV9VXQb8DLDQsi5J0spa9yCuBbZNNyTZAFwNXABsBXYk2drNuwj4S+DPG9clSVpB04CoqtuAR5Y0nw/sq6r9VfU4cD1wcbf8TVX1MuC1LeuSJK1sFndSnw48OPX5APDSJC8HXgM8G9iz3MpJdgG7ADZt2tSuyhnwDmtJ82QWAZGetqqqjwEfW2nlqtoN7AZYWFioNa1MkvSUWVzFdAA4c+rzGcDB1WzAN8pJUnuzCIg7gXOSnJXkBOAS4KbVbMA3yklSe60vc70O+ARwbpIDSXZW1RPA5cAtwH3ADVV17yq3aw9CkhpL1XhP4y8sLNTevXuPaN3lBoTHxMFrSUciyV1VteL9ZnNzJ7Ukab6MMiA8xSRJ7Y0yIByklqT2RhkQkqT2DAhJUq9RBoRjEJLU3iwetXHUqupm4OaFhYVLZ13LLPnsJkktjbIHIUlqz4CQJPUaZUA4BiFJ7Y0yILwPQpLaG2VASJLaMyAkSb0MCElSr1EGhIPUktTeKAPCQWpJam+UASFJam+Uj9rQ4fkIDklrwR6EJKmXASFJ6jXKgPAqJklqb5RjED7u+8g4NiFpNUbZg5AktTcoIJJ8V+tCJEnzZWgP4p1J7kjyS0me17QiSdJcGBQQ
VfUDwGuBM4G9Sf5vkv/ctDJJ0kwNHoOoqi8CbwLeAPwwcFWSzyd5TaviJEmzM3QM4ruTvB24D/hRYHtVfWc3/faG9UmSZmToZa7vAN4F/HpVfWOxsaoOJnlTk8okSTM1NCBeCXyjqp4ESPIM4MSq+npVvbdZdctIsh3YvmXLlvX+6mOS90dI6jN0DOLDwHOmPp/Utc2Ej/uWpPaGBsSJVfXVxQ/d9EltSpIkzYOhAfG1JOctfkjyvcA3DrO8JGnkho5B/CrwviQHu88vBH62TUmSpHkwKCCq6s4k3wGcCwT4fFX9S9PKJEkztZqnuX4fsLlb5yVJqKr3NKlKkjRzgwIiyXuB/wB8Gniyay7AgJCkY9TQHsQCsLWqqmUxkqT5MfQqpnuAb29ZiCRpvgztQWwEPpfkDuCxxcaquqhJVZKkmRsaEG9tWYTmk4/gkI5vQy9z/YskLwLOqaoPJzkJ2LDWxSR5FXAh8ALg6qr6s7X+DknSMEOvYroU2AU8n8nVTKcD7wR+bMC61wA/ATxUVd811b4N+D0mQfPuqnpbVX0A+ECSU4HfBgyIOWTPQjo+DB2k/mXg+4FH4amXB71g4LrXAtumG5JsAK4GLgC2AjuSbJ1a5E3dfEnSjAwNiMeq6vHFD0meyeQ+iBVV1W3AI0uazwf2VdX+brvXAxdn4n8CH6qqT/VtL8muJHuT7H344YcHli9JWq2hAfEXSX4deE73Lur3ATcfxfeeDjw49flA1/YrwCuAn0pyWd+KVbW7qhaqauG00047ihIkSYcz9CqmK4CdwN3ALwB7gHcfxfemp62q6irgqqPYriRpjQy9iulfmbxy9F1r9L0HgDOnPp8BHFxm2X/HN8pJUnuDTjEl+VKS/Ut/juJ77wTOSXJWkhOAS4Cbhq7sG+Ukqb3VPItp0YnATzO55HVFSa4DXg5sTHIAeEtV/Z8klwO3MLnM9Zqqundo0fYgJKm9oaeYvrKk6XeT/CXw5gHr7limfQ+TsYxVq6qbgZsXFhYuPZL1JUkrG3qj3HlTH5/BpEfx3CYVSZLmwtBTTP9ravoJ4AHgZ9a8moE8xSRJ7WXMr3hYWFiovXv3HtG6yz0uQm34GA5pfiS5q6oWVlpu6CmmXzvc/Kr6naGFSZLGYTVXMX0fT1+Kuh24jW++G1palg/4k8ZnNS8MOq+q/gkgyVuB91XVz7cq7HAcg5Ck9oY+i2kT8PjU58eBzWtezUDeKCdJ7Q3tQbwXuCPJnzB5iuurgfc0q0qSNHNDb5T7zSQfAn6wa3p9Vf11u7IkSbM29BQTwEnAo1X1e8CBJGc1qmlFSbYn2X3o0KFZlSBJx7yhD+t7C/AG4I1d07OAP2xV1Eocg5Ck9ob2IF4NXAR8DaCqDuKjNiTpmDY0IB6vyS3XBZDk5HYlSZLmwdCAuCHJ/wael+RS4MOs3cuDJElzaOhVTL/dvYv6UeBc4M1VdWvTyg7DG+Ukqb0VAyLJBuCWqnoFMLNQmOb7ICSpvRUDoqqeTPL1JKdUldeVak35jCZpfg29k/qfgbuT3Ep3JRNAVf23JlVJkmZuaEB8sPuR1oU9C2n2DhsQSTZV1d9V1R+sV0GSpPmw0mWuH1icSPL+xrUM5qM2JKm9lQIiU9NntyxkNXzUhiS1t1JA1DLTkqRj3EqD1N+T5FEmPYnndNN0n6uqvrVpdZKkmTlsQFTVhvUqRJI0X1bzPghJ0nHEgJAk9Rp6o5w0F7yBTlo/9iAkSb1G2YPwcd+aF/ZodCwbZQ/CG+Ukqb1R9iCkofwNXzpyo+xBSJLaMyAkSb0MCElSLwNCktTLgJAk9fIqJh0TlrtaSdKRswchSeplQEiSes3NKaYkZwNXAqdU1U/Nuh4d27yBTlpZ0x5EkmuSPJTkniXt25Lcn2RfkisAqmp/Ve1sWY8kabjWp5iuBbZNNyTZAFwNXABsBXYk2dq4DknSKjUNiKq6DXhkSfP5wL6ux/A4cD1wccs6JEmrN4sxiNOB
B6c+HwBemuTbgN8EXpLkjVX1W30rJ9kF7ALYtGlT61olwMtodXyaRUCkp62q6ivAZSutXFW7gd0ACwsLtca1SZI6swiIA8CZU5/PAA6uZgO+MEit2FOQnjaL+yDuBM5JclaSE4BLgJtWswFfGCRJ7bW+zPU64BPAuUkOJNlZVU8AlwO3APcBN1TVvS3rkCStXtNTTFW1Y5n2PcCeI92up5g07w53qsqb8TQWo3zUhqeYJKm9UQaEJKm9UQZEku1Jdh86dGjWpUjSMWuUAeEpJklqb5QBIUlqb5QB4SkmSWpvlAHhKSZJam+UASFJas+AkCT1GmVAOAYhSe2NMiAcg5Ck9kYZEJKk9gwISVIvA0KS1GuUAeEgtSS1N8qAcJBaktobZUBIktozICRJvQwISVIvA0KS1MuAkCT1euasCzgSSbYD27ds2TLrUqSZ2XzFB3vbH3jbhetciY5Vo+xBeJmrJLU3yoCQJLVnQEiSehkQkqReBoQkqZcBIUnqZUBIknoZEJKkXt4oJ62z5W5wa719b6DTao2yB+GNcpLU3igDQpLUngEhSeplQEiSehkQkqReBoQkqZcBIUnqZUBIknoZEJKkXgaEJKnX3DxqI8nJwO8DjwMfq6o/mnFJknRca9qDSHJNkoeS3LOkfVuS+5PsS3JF1/wa4MaquhS4qGVdkqSVtT7FdC2wbbohyQbgauACYCuwI8lW4AzgwW6xJxvXJUlaQdOAqKrbgEeWNJ8P7Kuq/VX1OHA9cDFwgElINK9LkrSyWYxBnM7TPQWYBMNLgauAdyS5ELh5uZWT7AJ2AWzatKlhmdKxZa0eA77a7czj48fnsablzLLWWQREetqqqr4GvH6llatqN7AbYGFhoda4NklSZxancg4AZ059PgM4uJoNJNmeZPehQ4fWtDBJ0tNmERB3AuckOSvJCcAlwE2r2YAvDJKk9lpf5nod8Ang3CQHkuysqieAy4FbgPuAG6rq3pZ1SJJWr+kYRFXtWKZ9D7DnSLfrO6klqb1RXk7qKSZJam+UASFJam+UAeFVTJLU3igDwlNMktReqsZ7r1mSh4G/XWGxjcCX16Gc1ZrXumB+a5vXumB+a5vXumB+azse6npRVZ220kKjDoghkuytqoVZ17HUvNYF81vbvNYF81vbvNYF81ubdT1tlKeYJEntGRCSpF7HQ0DsnnUBy5jXumB+a5vXumB+a5vXumB+a7OuzjE/BiFJOjLHQw9CknQERhsQy7zXenr+s5P8cTf/k0k2T817Y9d+f5Ifn0Ftv5bkc0k+m+TPk7xoat6TST7d/azqKbdrUNfrkjw89f0/PzXv55J8sfv5ubWsa2Btb5+q6wtJ/v/UvJb7rPe96lPzk+Sqru7PJjlval6zfTagrtd29Xw2yceTfM/UvAeS3N3tr71rWdfA2l6e5NDUn9mbp+Yd9jhoXNf/mKrpnu64en43r9k+S3Jmko8muS/JvUn+e88yMznOqKrR/QAbgL8BzgZOAD4DbF2yzC8B7+ymLwH+uJve2i3/bOCsbjsb1rm2HwFO6qZ/cbG27vNXZ7jPXge8o2fd5wP7u/+e2k2fup61LVn+V4BrWu+zbts/BJwH3LPM/FcCH2LyIqz/CHxynfbZSnW9bPH7mLz//ZNT8x4ANs5wn70c+NOjPQ7Wuq4ly24HPrIe+wx4IXBeN/1c4As9fzdncpyNtQex3Hutp10M/EE3fSPwY0nStV9fVY9V1ZeAfd321q22qvpoVX29+3g7T7+Lu6Uh+2w5Pw7cWlWPVNU/ArcC22ZY2w7gujX8/mVV/3vVp10MvKcmbgeel+SFNN5nK9VVVR/vvhfW7xhb/O6V9tlyjuYYXeu61vMY+/uq+lQ3/U9MXoNw+pLFZnKcjTUg+t5rvXSHPrVMTd5BcQj4toHrtq5t2k4mvxksOjHJ3iS3J3nVDOr6ya4Le2OSxTf/zc0+607HnQV8ZKq51T4bYrnaW++z1Vh6jBXwZ0nuyuQd
77Pwn5J8JsmHkry4a5uLfZbkJCb/yL5/qnld9lkmp8JfAnxyyayZHGezeCf1Wuh9r/XAZYasezQGbz/JfwEWgB+eat5UVQeTnA18JMndVfU361TXzcB1VfVYksuY9MB+dOC6rWtbdAlwY1U9OdXWap8NMavjbJAkP8IkIH5gqvn7u/31AuDWJJ/vfrteL59i8qiHryZ5JfAB4BzmZJ8xOb30V1U13dtovs+SfAuTUPrVqnp06eyeVZofZ2PtQQx5r/VTyyR5JnAKk+7lUb8Tew1qI8krgCuBi6rqscX2qjrY/Xc/8DEmv02sS11V9ZWpWt4FfO/QdVvXNuUSlnT9G+6zIZarvfU+W1GS7wbeDVxcVV9ZbJ/aXw8Bf8LanmJdUVU9WlVf7ab3AM9KspE52Gedwx1jTfZZkmcxCYc/qqr/17PIbI6zFoMurX+Y9Hz2MznVsDiY9eIly/wy3zxIfUM3/WK+eZB6P2s7SD2ktpcwGYw7Z0n7qcCzu+mNwBdZo0G6gXW9cGr61cDt9fRA2Je6+k7tpp+/nvusW+5cJoOFWY99NvUdm1l+wPVCvnnw8I712GcD6trEZHztZUvaTwaeOzX9cWDbWtY1oLZvX/wzZPIP7d91+2/QcdCqrm7+4i+SJ6/XPuv+398D/O5hlpnJcbamB8V6/jAZ1f8Ck39or+zafoPJb+QAJwLv6/6S3AGcPbXuld169wMXzKC2DwP/AHy6+7mpa38ZcHf3F+NuYOc61/VbwL3d938U+I6pdf9rty/3Aa9f733WfX4r8LYl67XeZ9cBfw/8C5Pf1nYClwGXdfMDXN3VfTewsB77bEBd7wb+ceoY29u1n93tq890f9ZXNvizXKm2y6eOs9uZCrG+42C96uqWeR2Ti1im12u6z5ic/ivgs1N/Xq+ch+PMO6klSb3GOgYhSWrMgJAk9TIgJEm9DAhJUi8DQpLUy4CQJPUyICRJvQwISVKvfwN3k3smFzhR9gAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of the distance column: 50 bins, logarithmic count axis.\n",
    "df[\"distance\"].plot(kind=\"hist\", bins=50, log=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apply hierarchical clustering and visualize it as a dendrogram."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.cluster.hierarchy import linkage, dendrogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA20AAAJFCAYAAABOeh8lAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xu0ZVldH/rvj67mJSCGLihsoNsHMSItB7rTQhCpq2jAF0bQQHGBdugt4hWBIcEQjYCaoTH3hpsoCKMM2KIcNAGSYNI+AyUQrl668dAltJgW20sFCpqH0C0IaZ35Y61qdu/a5+xz6ux99qw6n88YZ1Ttvdbea8695np811yPaq0FAACAPt1p1QUAAABgc0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGMHVjXhiy66qF166aWrmjwAAMBKXXfddR9trR2cN97KQtull16aa6+9dlWTBwAAWKmq+vPtjOf0SAAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGMHVl2AZTl2LFlfX3UpgL125Ehy9OiqSwEAsDjnbU/b+nqysbHqUgB7aWPDwRoA4Pxz3va0JcnaWnL8+KpLAeyVw4dXXQIAgMU7b3vaAAAAzgdCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADo2NzQVlUPrKq3VNUNVfWeqnrujHEOV9Unq2pj/HvRcooLAACwvxzYxji3JXl+a+1dVXXPJNdV1e+01t47Nd7bWmvfuvgiAgAA7F9ze9paax9qrb1r/P8tSW5IcvGyCwYAAMAOr2mrqkuTPDzJH8wY/KiqendV/UZVfdUCygYAALDvbef0yCRJVd0jyRuSPK+19qmpwe9Kcklr7daq+uYk/zHJg2d8x9EkR5PkQQ960FkXGgAAYL/YVk9bVV2YIbC9trX2xunhrbVPtdZuHf9/TZILq+qiGeMda61d0Vq74uDBg7ssOgAAwPlvO3ePrCSvSnJDa+2lm4xzaBwvVXXl+L0fW2RBAQAA9qPtnB756CRPT3KiqjbG934kyYOSpLX2yiRPTvL9VXVbks8keUprrS2hvAAAAPvK3NDWWnt7kpozzsuSvGxRhQIAAGCwo7tHAgAAsLeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHTswKoLwOaOHUvW11ddCjh3bGwM/x4+vNJiwDnjyJHk6NFVlwKAefS0dWx9/fM7ocB8a2vDHzDfxoYDgwDnCj1tnVtbS44fX3UpADjf6JEGOHfoaQMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6
JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdmxvaquqBVfWWqrqhqt5TVc+dMU5V1c9W1Y1VdX1VPWI5xQUAANhfDmxjnNuSPL+19q6qumeS66rqd1pr750Y5wlJHjz+fU2SV4z/AgAAsAtze9paax9qrb1r/P8tSW5IcvHUaE9M8po2+P0k966q+y+8tAAAAPvMjq5pq6pLkzw8yR9MDbo4yQcmXp/MmcEuVXW0qq6tqmtvvvnmnZUUAABgH9p2aKuqeyR5Q5LntdY+NT14xkfaGW+0dqy1dkVr7YqDBw/urKQAAAD70LZCW1VdmCGwvba19sYZo5xM8sCJ1w9I8sHdFw8AAGB/287dIyvJq5Lc0Fp76SajvSnJM8a7SD4yySdbax9aYDkBAAD2pe3cPfLRSZ6e5ERVbYzv/UiSByVJa+2VSa5J8s1Jbkzy6STfs/iiAgAA7D9zQ1tr7e2Zfc3a5DgtyQ8sqlAAAAAMdnT3SAAAAPaW0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAxw6sugAAPTl23bGsn1hfdTFg6TZO/eskyeGrn7fiksDeOHLZkRy9/OiqiwFnRWgDmLB+Yj0bpzaydmht1UWBpVp7obDG/rFxaiNJhDbOWUIbwJS1Q2s5ftXxVRcDgAU5fPXhVRcBdsU1bQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQsQOrLgAAAFs7dt2xrJ9YX3UxzlkbpzaSJIevPrzagpzDjlx2JEcvP7rqYuxbetoAADq3fmL99uDBzq0dWsvaobVVF+OctXFqw0GDFdPTBgBwDlg7tJbjVx1fdTHYh/RQrp6eNgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjh1YdQGA5Nh1x7J+Yn3VxSDJxqmNJMnhqw+vtiDkyGVHcvTyo6suBgCsnJ426MD6ifXbwwKrtXZoLWuH1lZdjH1v49SGAxkAMNLTBp1YO7SW41cdX3UxoAt6OgHg8/S0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMfcPRLgPHM+PPfvfHlenmfNAbAIetoAzjPnw3P/zofn5XnWHACLoqcN4DzkuX+rd673EgLQDz1tAAAAHRPaAAAAOia0AQAAdExoAwAA6JgbkXDOc3vzfri9OQDA4ulp45zn9uZ9cHtzAIDl0NPGecHtzVfvXO8lBADolZ42AACAjgltAAAAHRPaAAAAOuaaNgD2pWXfeXav7grrrq0A5z89bQDsS8u+8+xe3BXWXVsB9gc9bQDsW+f6nWfdtRXOPz0+f7bX58nupzMN9LQBAEAnenz+bI/Pk91vZxroaQMAgI6c62cB7IXeev2WTU8bAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DHPaYN94th1x5b6
EMrTDwJd9nNTjlx2JEcvP7rUaQAA9ERPG+wT6yfWbw9Wy7B2aC1rh9aW9v3JEAyXGTwBAHqkpw32kbVDazl+1fFVF+OsLbsXDwCgR3raAAAAOia0AQAAdGxuaKuqV1fVR6rqjzYZfriqPllVG+PfixZfTAAAgP1pO9e0XZ3kZUles8U4b2utfetCSgQAAMDt5va0tdbemuTje1AWAAAApizqmrZHVdW7q+o3quqrNhupqo5W1bVVde3NN9+8oEkDAACcvxYR2t6V5JLW2sOS/FyS/7jZiK21Y621K1prVxw8eHABkwYAADi/7Tq0tdY+1Vq7dfz/NUkurKqLdl0yAAAAdv9w7ao6lOTDrbVWVVdmCIIf23XJAAAW4Nh1x7J+Yn3VxdiVjVMbSZLDVx9ebUF26chlR3L08qOrLgacc+aGtqp6XZLDSS6qqpNJXpzkwiRprb0yyZOTfH9V3ZbkM0me0lprSysxAMAOrJ9Yz8apjawdWlt1Uc7auVz2004HT6ENdm5uaGutPXXO8JdleCQAAECX1g6t5fhVx1ddjH3tXO8lhFXa9emRAGzfXpymtRenUTnFCQD2zqJu+Q/ANpw+TWuZ1g6tLfVUqo1TG+f89UEAcC7R0wawx87107Sc4gQAe0tPGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4dWHUBOL8du+5Y1k+sL3UaG6c2kiSHrz681OkcuexIjl5+dKnTAACAaXraWKr1E+u3h6plWTu0lrVDa0udxsapjaWHTwAAmEVPG0u3dmgtx686vupi7Mqye/EAAGAzetoAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGMHVl0AAABW79h1x7J+Yn1p379xaiNJcvjqw0ubRpIcuexIjl5+dKnTgL2mpw0AgKyfWL89WC3D2qG1rB1aW9r3J0MwXGbwhFXR0wYAQJIhWB2/6viqi3HWlt2LB6uipw0AAKBjetoAAICFWfb1kcn+u0ZSTxsAALAwy74+Mtl/10jqaQMAABbqXL8+MunrGkk9bQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHTuw6gIAAPvXseuOZf3E+lKnsXFqI0ly+OrDS5vGkcuO5OjlR5f2/cD+pqcNAFiZ9RPrt4eqZVk7tJa1Q2tL+/6NUxtLD57A/qanDQBYqbVDazl+1fFVF+OsLbMHDyDR0wYAANA1oQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHZsb2qrq1VX1kar6o02GV1X9bFXdWFXXV9UjFl9MAACA/Wk7PW1XJ3n8FsOfkOTB49/RJK/YfbEAAABIthHaWmtvTfLxLUZ5YpLXtMHvJ7l3Vd1/UQUEAADYzxZxTdvFST4w8frk+B4AAAC7tIjQVjPeazNHrDpaVddW1bU333zzAiYNAABwfltEaDuZ5IETrx+Q5IOzRmytHWutXdFau+LgwYMLmDQAAMD5bRGh7U1JnjHeRfKRST7ZWvvQAr4XAABg3zswb4Sqel2Sw0kuqqqTSV6c5MIkaa29Msk1Sb45yY1JPp3ke5ZVWAAAgP1mbmhrrT11zvCW5AcWViIAAABut4jT
IwEAAFgSoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOrat0FZVj6+q91XVjVX1whnDr6qqm6tqY/z7vsUXFQAAYP85MG+EqrogycuTfGOSk0neWVVvaq29d2rUX2utPXsJZQQAANi3ttPTdmWSG1tr72+tfS7JryZ54nKLBQAAQLK90HZxkg9MvD45vjftSVV1fVW9vqoeOOuLqupoVV1bVdfefPPNZ1FcAACA/WU7oa1mvNemXv96kktba1+d5HeT/NKsL2qtHWutXdFau+LgwYM7KykAAMA+tJ3QdjLJZM/ZA5J8cHKE1trHWmufHV/+QpLLF1M8AACA/W07oe2dSR5cVV9SVXdO8pQkb5ocoaruP/Hy25PcsLgiAgAA7F9z7x7ZWrutqp6d5LeSXJDk1a2191TVTyS5trX2piTPqapvT3Jbko8nuWqJZQYAANg35oa2JGmtXZPkmqn3XjTx/3+a5J8utmgAAABs6+HaAAAArIbQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADomNAGAADQMaENAACgY0IbAABAx4Q2AACAjgltAAAAHRPaAAAAOia0AQAAdExoAwAA6JjQBgAA0DGhDQAAoGNCGwAAQMeENgAAgI4JbQAAAB0T2gAAADomtAEAAHRMaAMAAOiY0AYAANAxoQ0AAKBjQhsAAEDHhDYAAICObSu0VdXjq+p9VXVjVb1wxvC7VNWvjcP/oKouXXRBAQAA9qO5oa2qLkjy8iRPSPKQJE+tqodMjfa9ST7RWvvyJP9Pkp9ZdEEBAAD2o+30tF2Z5MbW2vtba59L8qtJnjg1zhOT/NL4/9cn+YaqqsUVEwAAYH/aTmi7
OMkHJl6fHN+bOU5r7bYkn0xyn0UUEAAAYD+r1trWI1R9V5K/31r7vvH105Nc2Vr7wYlx3jOOc3J8/afjOB+b+q6jSY6OL78iyfsWVREAAIBzzCWttYPzRjqwjS86meSBE68fkOSDm4xzsqoOJPnCJB+f/qLW2rEkx7YxTQAAALK90yPfmeTBVfUlVXXnJE9J8qapcd6U5Jnj/5+c5M1tXhceAAAAc83taWut3VZVz07yW0kuSPLq1tp7quonklzbWntTklcl+eWqujFDD9tTllloAACA/WLuNW0AAACszrYerg0AAMBqCG0AAAAdE9pYmKr66ap63jbHfWlV/aNll2mnzpU67KScu5zOc6rqXyzpu/eqDl9dVe9Y9nQ2mfZO2tMbq+rxyy7TjOme821pk+ktpF6LbD979VvvRlXdr6puqKq7rLossJUdrl/3dP1D/xa4jbhLVf1xVd13EeXaUmut+78kX5nkzRke2n1jkn8wMez7xvduTfKbSb54Ytj/luQt4+dumvG9P5nkRJLbkrxkwWW+S4YbtPx5kluS/GGSJ8wY78VJWpLHTbz3LzM8rPxT4+d/dGLYRUn+W5KPJfmLJP9vkkdPDH/l+Fuc/vtsklv2YB4dTPI/ktxtfP2QJNcm+cT497tJHjIx/v3HOt551e1rizo8MsnvZLi5zs1J/n2S+6+6DjPKeeckr09y09iWDk+NP285uCnJZybazG9PDLtrhkd63HfF7eUuY9v+8Dg/fj3JxRPDnz1+/rNJrp4xvWuSfNuK59Ol4/yZXD5/bGL8K5Nct+IyPm2qfJ8ey3z5OPwlSf7n1DhfOg6bt25aSlvaTr2mhs1a594lyaszrHNPJfmhRbefGb/1vPXLlm1+HOcpSW5I8pdJ/jTJY8b3560TKsnPjPPqYxm2OTUx/OeT/OAO6rbl8rfZ7z6+/7gk7xrr8IEk3z2+/5ipdnbr+PknjcOvSvLXU8MPb7fMU2XYdHud+cvtVtvrLeswVYY3j8MO7KKNLWW/Yxx+QZJ/nuGRT6e/+96Lnhe7WcaTfPe4PNyS5L1JvmNi2F6uf85qeRjL/44M693jMz7z9eOy8qkk709ydGLYj0z9/p9J8jdJLlpgvWbui2f++uY3psr2uSQnJobflE32Qfaq/WTOtm8c5xFJ3joO/3CS504M++Ek/2rp5V72BBbwwx5I8idJfmhcaXx9hpX7307y2CQfSfJVY6N5RZLfm/jslUmenuGB3jfN+O5nJnlCkv+UxYe2L8iwg3Nphh7Nbx1XJJdOjPNlGULjB6cW3K9I8gXj/y9O8p4k3zm+vus4/E4ZNrzfkWGDPnNFn+TqDHf8XPZ8ekGSX5h4fe+x7jXOt+ckuX7qM7+T5MmrbmNb1OEJSb4ryb2S3D3DTt1vrroOM8p55yTPS/K1ST6UM1eY85aDmzK1IzU1/BeS/ONVtpdxhfjuJPcbl4FfTvLGieHfOS4Lr8js0Pa0JP95xfPp0szZKUvy35Ncsaoyzhh+VYYwcPqmVS9J8iubjDt33bSMtrSTemXzde5PJ3lbki/KsGNyKsnjF9l+drp+2Uab/8YMO9ePHH/zizOGum2sE56V5H0Znrt6cYYd3H80MfzRSf5oB3Wbt/xt9rs/JMM2/AkZtvX3SfJlm0zjcIZt6Olt41VJ3r6g9rLp9nrecpstttfz6jDVvt661XR2W49tzIst65EhsL05ySUZlu+HJrnroufFLpanizOEgSeM5fuWDDve950YZ6/WP2e7PDwuQ3B7UaZCW5ILM4SlZ431+7sZwsPDNinDSzI8fmtRddpqX3zL9c2M7zqe5EUTr2/KFvsge9F+Zgy/Knfc9l2UYV31tAwHR+6Z5Csnxn9Ako8muctSy72XP9JZ/rAPHRvm5FHA387QS/Z/J3n5xPtfPK70vmzqOx6XGTurE8N/JQsObZtM5/pMHGHLcPThm7dqsOOK6ESSH54x7E5Jvm2s8xlHjzKswG9J8tg9qNub
k/zvmww7kOQHknx66v0fTfKLq25j26nDOPwRmeq1XEUd5vzWJzdbYW62HMxbYY4rqbessr1k2Pj9y4nX35LkfTM++88zeyN5cYYjeUtdoW5Vx2wvtP1Ckhf30JbG4W+ZLE+2CG1Tn5u5blpGW9pJvTZb52Y44vpNE69/MsmvLrL97HT9Mq/NZzgi/73bmO4Z64Txs5NH6b83ye9PvD6QYYf3kh3WcbPlb7PffT3JT27zu39xcl2bJQeFjNvr7Sy3U+1k5vZ6Vh3G974ww87wI7c7nbOpx7x5sVU9MhzMuDWbB+qlzotNpjm9fv2aJB+ZGufmJI+aeL0n65+J6e1oeZgY/n05M7Tdb2wfd594751Jnjrj85UhcDxzgXXZdF98arxN90HG4Zdm6JX9kon3Nm2Le9V+Zgyf3vb9VJJfnvOd/z1L3t8+F65pq03ee+j4b80Y96HLLtROVdX9MhyReM/4+ruSfK61ds0m47+wqm7NsAB8QYaN2+Tw65P8VYYHm//b1tpHZnzNkzKstN66qHps4bIMR27voKr+IkM5fy5Do590Q5KHLb9o2zazDhO+LuP8m7CKOswr59l4bVXdXFW/XVXT9VlGHXfaXl6V5NFV9cVVdfcMG9/f2O7EWmv/I8NpfV+xm0Lv0Gbz6c+r6mRV/WJVXTQ1bK/b06ZtqaouydDmXzM16Nuq6uNV9Z6q+v4Zn9tq3bRX9TujXputc6vqizIc8Hv3xNvvznAGR5KFtZ+drl82bfNVdUGSK5IcrKobx/b0sqq62zbL8lXZur63ZTj9adfzas627pHjOCeq6kNV9StV9bdmfMfdkzw5yS9NDXp4VX20qv6kqn6squY+e3abZb7D9nq06XI7b3s9pw4/lSGgn1pE2aemuaj9jssyXEby5Ko6Nf7ePzD18aXMiy1ML0/XJrmhqr69qi6oqu/IcHri9RPjrHyfY9482Exr7cNJXpfke8b6PSpDr+fbZ4z+mAwh7w27Le+ErfbFd+IZSd7WWvuzqfe32gdZhp1u+x6Z5ONV9Y6q+khV/XpVPWjqo0tvX+dCaPvjDF2SL6iqC6vqmzKcFnn3DNcZfPd4ofjdMnQpt3FYN6rqwiSvTfJLrbU/rqp7ZFhRb3oBZGvtX2QnqHX+AAAKLklEQVTofn1EhtNiPjk1/KsznFZzJLMX2mQ4/fM1bTwEsGT3ztCrdwettXtnOJL47AznwE+6ZfxcL2bWIRluRpChfb1gatAq6rBpOc/S0zIc/bokw9Gl36qqyTrdkmEeLtJO28ufJPn/M/SIfCrDKWw/scNp7vW8mq7jRzOc0nJJksszLN+vnfrMqss4adbG9d9l+O0PJvk/kryoqp46+aE566ZltKVZ7lCvOevce4z/Tq5jP5lh/kza7bzZ6fplqzZ/vwynSz05ww7aWpKHJ/ln2yzLPXJmfe9RVZM7Zrtui9vY1j0gw6nbT0ry4CR3y3DAZtqTMiw/vzfx3lsz7DDedxz+1Jy5fj6bMt9he51tLLfztteb1aGqrshwKuqsOi+0Hrvc73hAhuX2byf5kgzt7iVV9Y3j8KXMiznusDy11v46w072eoawtp7kWa21v5z4zF6tf2bazjyY43UZ1hOfzXA694+21j4wY7xnJnl9a+3Ws5zOLFvti+/EMzJctjNp3j7IMux02/eADL/rc5M8KMmfZZgfk5a+/e4+tLXW/meGc4O/JcORqOdn2HE42Vr7rxku5nxDhnP7b8rwo51cSWFnqKo7ZVj5fS7DjmiS/HiGbtbpIw130AZ/mOG0nB+fMfyvWmuvS/LC6SMTVfXADAvU9FHyZflEztzJSZKMK81XJnnN1N117pnhhgW9mFmHqvryDEe4n9tae9vU4FXUYdPf+my01v5ba+0zrbVPt9Z+OkN9HjMxyj0zeydkN3baXl6R4bqe+2Q4AvzG7KCnbbTX8+oOdWyt3dpau7a1dtt41PTZSb6pqu7VSxmnPCNTvQKttfe21j7YWvvr
1to7kvybDDtwmRpvs3XTMtrSLNP12mqde3rHZnI+3CtnbtB3O292un7Zqs1/Zvz351prH2qtfTTJSzOccrUdt+bM+t46dYBvEW1x3rbuMxlOF/yTcQfzpzK7DmccgGytvb+19mettb9prZ3IEGjPaIs7MWt7vc3ldu72eroO47R+PsN8v2035d5OPbK7/Y7T7e0nxm3F9Ul+NeO8Wsa82IY7LE9V9bgMN1M5nOEaq8cm+bdVtTbxmb1a/2xmW/Nglqr6O0l+LcN6+c4ZesZ/uKq+ZWq8u2W4Vna6R3dXttoX3+53VNXXJjmU4aYlk989bx9kGXa07cuwDPyH1to7W2t/lWFe/r2qmjwIsPTtd/ehLUlaa9e31h7bWrtPa+3vJ/nSJP/fOOzlrbUHt9bumyG8HUjyRyss7u3Go5avynBU9Eljo0+Sb0jynPE0g1NJHpjk31XVP9nkqw5kuHB1Mxdm+E0mPSPJO1pr7z/rCuzM9RmOwm3mThmOyFw88d5X5o6n6KzaGXUYu8l/N8N527884zOrqMO833q3Wu54KsQy6rjT9vKwDNcGfLy19tkMR6avnHF64UxV9cUZNnSLPq10K/PqeHoHdNm/9VZmlrGqHp3hlMHXn/GJO5puK9Om1017Vb/pem26zm2tfSLDxfOT4fJhmTg1bkHtZ6frl03b/Fjmk/l8G9qp92Tr+h5I8uXZ/byat627PnPqMB6APJz5ByDntcUtbbG9njWdbDGtM7bXm9ThXhlOcf218bd55/j+yao66x3WJe13nD7FcLvtbVfzYpuml6e1JG8dA/bftNbemeQPMlzLfdqq9zl2Og8mPTTDNa2/NdbvfUn+S4Ybr0z6zgw3gDq+wHIn2XpffJuemeFmSvN6AFfRfpJsue2bXletZPt9ToS28fTHu1bV3avqH2e41frV43sPrcGDkhxL8m/GDVqq6k5VddcMOw41jn/nie+9cBx+pyQHxuEXLLDor8gwE7+ttfaZife/IcMCuDb+fTDDHYFePpb5WVX1RWO9rsxwU4b/Opb5kVX1tVV156q627iw3y/DymnSrC7oZbomw5GtJElVfWNVPXw89/peGY4CfyLDOb+nPTY77y1Zpuk6XJzhYtWXt9ZeuclnVlGHO5Qzuf05IXcdX955bMs1Dtt0OaiqB1XVo8f2dNeqekE+f+v205ZRx522l3cmeUZVfeF42s//meSDYw9DqurAWMcLklww1mXymorDGe6k9dkF12Mr03X8mqr6inF+3CfJz2a42HzyyO9et6cz2tLomUne0Fq7Q29TVT1xat30nAx3393uummv6jddr03XuePw1yT5Z2Pd/k6GUz+vnvj84ey+/ex0/bJlm89wU4sfrKr71nBd3vOS/OeJ7990nTDW94eq6uIxkD5/qr5XZrhp0Z9vp2JbLH/zfvdfzHCNzpfWcM3XP5msw+jpGQ5A/unUNJ9QwzVbp3shfixjWzxLM7fXWy2387bXc+rwyQw7h6d/m9M9jJfnzO35ruuRXex3jOV+W5IfHdvVVyb5hxnn1RLmxXZML+PvTPKYGnvWqurhGXprJq9p25P1z9kuD+P2764ZAvOdxs9dOH7tHyZ5cFV9/TiPvizD3UGnQ8LSLoupTfbFx2FbrW8mewCvnvrO7eyDLMOOtn0Z1lX/oKrWxnnyYxluvvMXye3r87+V5PeXWOb+7x45trv/K8MO3K0ZFrgvH9+/d4YF8i8zdNf+dJILJj53OEManvw7PjH86hnDr1pQmS8Zv++vcsdnPzxtxrg3ZbxzToYA+ZsZjpTcmuG6hh/J5287+tgMC+kt4zi/l+Trpr7vUeNvcs89nEcXZTjye/qZF9+V4RzoWzPcDOWaJF89Mf79x/F7ek7bdB1enDOfz3PrquswXc6JNjTdli+dtxxkOMXi9DL0sQwb6Ssmvvf0s23ut+L2cp8M12d8JMPpB29PcuXE8JfMqONLJob/lyTfvuL29NQM58H/ZYaendck
OTQx/t9N8ocdtKW7jr/xN8wY/3VjO7l1nF/PmRi25bppWW1pu/WaGn5TNn9O24dz5nPadt1+ZrSHeeuXeW3+wgyn1/1Fhu3fz2a8BftEHTdbJ1SGU8k+Pv5NP6ft5ZPzdht123L52+x3H9/78QzL/M0ZTun7oqnhf5wZd8nMcPfoD4/L0/sznJJ34VnOm02319liuc2c7fW8OkyNc2l2f8v/pex3jONcPI5z6/h7P2sZ8+Jsl6fxvWdnuIHOLWM5nj8xbC/XP2e1PGS4C+f0566eGP7dGc4kO30Z0M8kudPUPLot4z7yEuo1c198oi4z1zfj8KdmuIxpetnYch9kj9vPptu+cfj3Z7jG+BMZnpv5wIlhL0jy0mWX+3QQgF2rqp/KcMvdf72Ncf9Vkj9trf388ku2fedKHXZSzl1O5wczrJh+eAnfvVd1uCzJsdbao5Y5nU2mvZP29IYkr2o7vKvYbp0PbWmT6S2kXotsP3v1W+9GDdeR/l6Sh7fh2g3o0g7Xr3u6/qF/C9xG3CXDAcuva7Pv5L4wQhsAAEDHzolr2gAAAPYroQ0AAKBjQhsAAEDHhDYAAICOCW0AAAAdE9oAAAA6JrQBAAB0TGgDAADo2P8Ca/rE8lmC+h0AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 1080x720 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Complete-linkage hierarchical clustering of X_std on Euclidean distances.\n",
    "row_clusters = linkage(X_std, metric=\"euclidean\", method=\"complete\")\n",
    "\n",
    "# Plot the tree on a 15x10 figure, truncated by level with p=5.\n",
    "plt.figure(figsize=(15, 10))\n",
    "f = dendrogram(row_clusters, truncate_mode=\"level\", p=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
